def extract_other_features(paths, path2gt, model_type):
    """Extracts MusiCNN or OpenL3 features and their corresponding
    ground_truth and identifiers (the path).

    OpenL3 features are extracted from non-overlapping audio patches of
    1 second, where each audio patch covers 128 mel bands. MusiCNN features
    are extracted from non-overlapping audio patches of 1 second, where each
    audio patch covers 96 mel bands. Ground_truth and identifiers are
    repeated to match the number of extracted features.
    """
    if model_type == 'openl3':
        model = openl3.models.load_embedding_model(input_repr="mel128",
                                                   content_type="music",
                                                   embedding_size=512)

    first_audio = True
    for p in paths:
        if model_type == 'musicnn':
            taggram, tags, extracted_features = extractor(
                config['audio_folder'] + p,
                model='MSD_musicnn',
                extract_features=True,
                input_overlap=1)
            # or choose any other layer, for example: emb = taggram
            # Documentation: https://github.com/jordipons/musicnn/blob/master/DOCUMENTATION.md
            emb = extracted_features['max_pool']
        elif model_type == 'openl3':
            wave, sr = wavefile_to_waveform(config['audio_folder'] + p, 'openl3')
            emb, _ = openl3.get_embedding(wave, sr,
                                          hop_size=1,
                                          model=model,
                                          verbose=False)

        if first_audio:
            features = emb
            ground_truth = np.repeat(path2gt[p], features.shape[0], axis=0)
            identifiers = np.repeat(p, features.shape[0], axis=0)
            first_audio = False
        else:
            features = np.concatenate((features, emb), axis=0)
            tmp_gt = np.repeat(path2gt[p], emb.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, emb.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    return [features, ground_truth, identifiers]
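# A minimal usage sketch, assuming `config['audio_folder']` is already set and
# that the files listed below exist (paths and labels here are hypothetical):
paths = ['blues/track_01.wav', 'rock/track_02.wav']
path2gt = {'blues/track_01.wav': 0, 'rock/track_02.wav': 1}

features, ground_truth, identifiers = extract_other_features(paths, path2gt, 'musicnn')
print(features.shape, ground_truth.shape, identifiers.shape)  # one row per 1-second patch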
def classify_audio(fname: str, model_path="../models/features_classifier.pkl"):
    # Load the trained classifier
    clf = joblib.load(model_path)

    # Extract the 'max_pool' features using musicnn
    _, _, features = extractor(fname,
                               model="MSD_musicnn",
                               input_overlap=1,
                               extract_features=True)
    maxpool_features = features["max_pool"]

    # Classify each feature vector with the trained classifier
    y_pred = clf.predict(maxpool_features)

    # Assign a string tag (e.g. 'classical' instead of 1) to each predicted class
    y_pred_labels = [idx2tag[n] for n in y_pred]

    # The assigned genre is the most common tag
    tag = Counter(y_pred_labels).most_common()[0][0]

    return tag
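# Sketch of how classify_audio could be called; the imports match what the
# function uses, while `idx2tag` and the audio file name are hypothetical
# assumptions about the surrounding script:
from collections import Counter

import joblib
from musicnn.extractor import extractor

idx2tag = {0: 'classical', 1: 'rock'}  # hypothetical index-to-genre mapping

print(classify_audio('some_track.mp3'))  # e.g. 'classical'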
def preprocess(self, musiccnn=False):
    debug_message = "preprocess" if not musiccnn else "musicnn preprocess"
    print(f'{debug_message} start')

    feature_name = '/spec/' if not musiccnn else '/embed/'

    # Make a directory per genre for the mel spectrograms / embeddings
    for genre in self.genres:
        os.makedirs(self.file_list_path + feature_name + genre, exist_ok=True)

    path_out_list = []
    for path_in in tqdm(self.train_path_list + self.test_path_list):
        path_out = self.file_list_path + feature_name + path_in.replace('.wav', '.npy')
        path_out_list.append(path_out)

        if os.path.isfile(path_out):
            print(f'{path_out} already exists')
            continue

        if musiccnn:
            # Extract musicnn features and average them over time
            _, _, embeds = extractor(f'{self.file_list_path}/wav/{path_in}',
                                     model='MSD_musicnn',
                                     extract_features=True)
            embed = embeds['max_pool'].mean(axis=0)
            np.save(path_out, embed)
        else:
            # Compute a log-scaled mel spectrogram with librosa
            signal, _ = librosa.load(f'{self.file_list_path}/wav/{path_in}',
                                     sr=self.sampling_rate)
            melspec = librosa.feature.melspectrogram(y=signal,
                                                     sr=self.sampling_rate,
                                                     n_fft=self.number_fft,
                                                     hop_length=self.hop_length,
                                                     n_mels=self.number_mel)
            melspec = librosa.power_to_db(melspec)
            melspec = melspec.astype('float32')
            np.save(path_out, melspec)

    print(f'{debug_message} finish')
    return path_out_list
def extract_embeddings(in_base, out_base, model='MSD_musicnn'):
    # Make directories to save embeddings.
    for split in splits:
        out_dir = out_base + split
        os.makedirs(out_dir, exist_ok=True)

        for path_in in tqdm(glob(os.path.join(in_base, split, '*.wav'))):
            filename = path_in.split('/')[-1]
            path_out = os.path.join(out_dir, filename.replace('.wav', '.npy'))

            # Skip if the embeddings file already exists.
            if os.path.isfile(path_out):
                continue

            # Extract the embedding using the pre-trained model.
            _, _, embeds = extractor(path_in, model=model, extract_features=True)

            # Average the embeddings over the temporal dimension.
            embed = embeds['max_pool'].mean(axis=0)

            # Save the embedding.
            np.save(path_out, embed)
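# Minimal usage sketch; `splits` is a module-level list in the original script,
# and the directory layout below is an assumption:
import os
from glob import glob

import numpy as np
from tqdm import tqdm
from musicnn.extractor import extractor

splits = ['train/', 'valid/', 'test/']
extract_embeddings('data/wav/', 'data/embeddings/', model='MSD_musicnn')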
# # This notebook explains how to use the vgg models in `musicnn` as music feature extractors.

# `musicnn` allows you to extract features at every layer of the model. For this reason, we first present it – so that you can understand what to expect out of each layer. To start, let's consider this music clip:

# In[1]:

file_name = './audio/TRWJAZW128F42760DD_test.mp3'

# Run these two lines of code to extract music features with our vgg model trained with the [MagnaTagATune](https://github.com/keunwoochoi/magnatagatune-list) dataset – the `MTT_vgg` model:

# In[2]:

from musicnn.extractor import extractor
taggram, tags, features = extractor(file_name, model='MTT_vgg', extract_features=True)

# Out of the extractor, we get the **output** of the model (the `taggram` and its associated `tags`) and all the **intermediate representations** of it (we refer to those as `features`). The `features` are packed in a dictionary:

# In[3]:

list(features.keys())

# These different key-features correspond to the outputs of the different layers that our vgg model has. For this reason, it is important that you understand the basic building blocks of this model – that we briefly outline in the following diagram:
#
# <br>
# <img src="./images/vgg.png">
# <br>
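# Before looking at each block in detail, a quick way to see what every layer
# returns is to print the shape of each intermediate representation (the exact
# shapes depend on the clip length and on the model):

for layer_name, layer_features in features.items():
    print(layer_name, layer_features.shape)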
def plotOutDiagrams(songPath, modelUsed, subfolderName=False, show=False):
    # print("The Path is " + songPath)
    taggram, tags, features = extractor(songPath, model=modelUsed)

    if "SV.wav" == songPath and subfolderName:
        # songName = "SV_" + os.path.basename(os.path.dirname(songPath))
        # print("SubfolderName: " + subfolderName)
        # print("DirName: " + os.path.dirname(subfolderName))
        # print("BaseName: " + os.path.basename(subfolderName))
        songName = "SV_" + fixName(os.path.basename(subfolderName))
    else:
        songName = os.path.basename(songPath)[:-4]
    # print("The song name is " + songName)

    in_length = 3  # seconds; by default, the model takes inputs of 3 seconds with no overlap

    plt.rcParams["figure.figsize"] = (10, 8)  # set size of the figures
    fontsize = 12  # set figures font size

    fig, ax = plt.subplots()

    # title
    ax.title.set_text('Taggram ' + songName + " " + modelUsed)
    ax.title.set_fontsize(fontsize)

    # x-axis title
    ax.set_xlabel('(seconds)', fontsize=fontsize)

    # y-axis
    y_pos = np.arange(len(tags))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(tags, fontsize=fontsize - 1)

    # x-axis
    x_pos = np.arange(taggram.shape[0])
    x_label = np.arange(in_length / 2, in_length * taggram.shape[0], 3)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_label, fontsize=fontsize)

    # depict taggram
    ax.imshow(taggram.T, interpolation=None, aspect="auto")

    # save before showing, so the written figure is not blank
    plt.savefig("Taggram " + songName + " _ " + modelUsed + ".png")
    if show:
        plt.show()

    ####

    tags_likelihood_mean = np.mean(taggram, axis=0)  # averaging the taggram through time

    fig, ax = plt.subplots()

    # title
    ax.title.set_text('Tags likelihood (mean of the taggram) ' + songName)
    ax.title.set_fontsize(fontsize)

    # y-axis title
    ax.set_ylabel('(likelihood)', fontsize=fontsize)

    # y-axis
    ax.set_ylim((0, 1))
    ax.tick_params(axis="y", labelsize=fontsize)

    # x-axis
    ax.tick_params(axis="x", labelsize=fontsize - 1)
    pos = np.arange(len(tags))
    ax.set_xticks(pos)
    ax.set_xticklabels(tags, rotation=90)

    # depict song-level tags likelihood
    ax.bar(pos, tags_likelihood_mean)

    # save before showing, so the written figure is not blank
    plt.savefig("Tags likelihood " + songName + " " + modelUsed + ".png")
    if show:
        plt.show()

    plt.close('all')
    return taggram, tags
ax.title.set_text('Taggram')
ax.title.set_fontsize(fontsize)

# x-axis title
ax.set_xlabel('(seconds)', fontsize=fontsize)

# y-axis
y_pos = np.arange(len(tags))
ax.set_yticks(y_pos)
ax.set_yticklabels(tags, fontsize=fontsize - 2)

# x-axis
x_pos = np.arange(taggram.shape[0])
x_label = np.arange(taggram.shape[0])
ax.set_xticks(x_pos)
ax.set_xticklabels(x_label, fontsize=4)

plt.show()

print("Generating MSD tags...")
MSD_taggram, MSD_tags, MSD_features = extractor(input_path,
                                                model='MSD_musicnn',
                                                input_overlap=1,
                                                extract_features=True)

print("Generating MTT tags...")
MTT_taggram, MTT_tags, MTT_features = extractor(input_path,
                                                model='MTT_musicnn',
                                                input_overlap=1,
                                                extract_features=True)

# concatenate the two taggrams along the tag dimension
taggram = np.concatenate((MSD_taggram, MTT_taggram), 1)
tags = np.concatenate((MSD_tags, MTT_tags))

showTags(taggram, tags)
exportJSON(taggram, tags, output_path)
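# showTags and exportJSON are defined elsewhere in this script. A minimal,
# hypothetical sketch of exportJSON, assuming it stores the mean likelihood of
# every tag as JSON:
import json

def exportJSON(taggram, tags, output_path):
    tags_likelihood_mean = np.mean(taggram, axis=0)  # average the taggram over time
    with open(output_path, 'w') as f:
        json.dump(dict(zip(list(tags), tags_likelihood_mean.tolist())), f, indent=2)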
"Wrong usage. Correct example: `python3 recommend.py path_to_file.mp3`" ) sys.exit(1) # get the argument/path provided and check if it's .mp3 input_path = sys.argv[1] if '.mp3' not in input_path: print("Wrong path format. The script has only been tested on .mp3 files") sys.exit(2) # get the length of the audio input_length = math.floor(MP3(input_path).info.length) # extract features using MusicNN input_embed, _, _ = extractor(input_path, model='MTT_musicnn', input_length=input_length, extract_features=True) # load in the training set and select the embedding part data = pd.read_csv('data/train_set.csv') embeddings = data.iloc[:, 1:].values # compute the similarities between input_embed and embeddings cos_sim = [] for i in range(embeddings.shape[0]): cos_sim.append( cosine_similarity(input_embed.tolist()[0], embeddings[i, :].tolist())) # sort cosine similarities, get top_n of them and their audio paths top_n = 3 idx = np.array(cos_sim).argsort()[::-1][:top_n]
def top_tags(file_name,
             model='MTT_musicnn',
             topN=3,
             input_length=3,
             input_overlap=False,
             print_tags=True,
             save_tags=False):
    '''Predict the topN tags of the music-clip in file_name with the selected model.

    INPUT

    - file_name: path to the music file to tag.
      Data format: string.
      Example: './audio/TRWJAZW128F42760DD_test.mp3'

    - model: select a music audio tagging model.
      Data format: string.
      Options: 'MTT_musicnn', 'MTT_vgg', 'MSD_musicnn', 'MSD_musicnn_big' or 'MSD_vgg'.
      MTT models are trained with the MagnaTagATune dataset. MSD models are trained
      with the Million Song Dataset. To know more about these models, check our
      musicnn / vgg examples, and the FAQs.
      Important! 'MSD_musicnn_big' is only available if you install from source:
      python setup.py install.

    - topN: extract the N most likely tags according to the selected model.
      Data format: integer.
      Example: 3

    - input_length: length (in seconds) of the input spectrogram patches. Set it
      small for real-time applications.
      Note: this is the length of the data that is going to be fed to the model.
      In other words, this parameter defines the temporal resolution of the taggram.
      Recommended value: 3, because the models were trained with 3-second inputs.
      Observation: the vgg models do not allow for different input lengths. For this
      reason, the vgg models' input_length needs to be set to 3. However, musicnn
      models allow for different input lengths: see this jupyter notebook.
      Data format: floating point number.
      Example: 3.1

    - input_overlap: amount of overlap (in seconds) of the input spectrogram patches.
      Note: set it considering the input_length.
      Data format: floating point number.
      Example: 1.0

    - print_tags: set it True for printing the tags.
      Note: even if you don't print the tags, these will be returned by the
      musicnn.tagger.top_tags() function.
      Data format: boolean.
      Options: False (for NOT printing the tags), True (for printing the tags).

    - save_tags: path where to store/save the tags.
      Data format: string.
      Example: 'file_name.tags'

    OUTPUT

    - tags: topN most likely tags of the music-clip in file_name considering the
      selected model.
      Data format: list.
      Example: ['synth', 'techno']
    '''
    if 'vgg' in model and input_length != 3:
        raise ValueError(
            'Set input_length=3, the VGG models cannot handle different input lengths.')

    taggram, tags = extractor(file_name,
                              model=model,
                              input_length=input_length,
                              input_overlap=input_overlap,
                              extract_features=False)
    tags_likelihood_mean = np.mean(taggram, axis=0)

    if print_tags:
        print('[' + file_name + '] Top' + str(topN) + ' tags: ')

    if save_tags:
        to = open(save_tags, 'a')
        to.write(file_name + ',' + model + ',input_length=' + str(input_length) +
                 ',input_overlap=' + str(input_overlap))

    topN_tags = []
    for tag_index in tags_likelihood_mean.argsort()[-topN:][::-1]:
        topN_tags.append(tags[tag_index])
        if print_tags:
            print(' - ' + tags[tag_index])
        if save_tags:
            to.write(',' + tags[tag_index])

    if save_tags:
        to.write('\n')
        to.close()

    return topN_tags
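# Short usage example; the clip is the one used in the musicnn examples, and
# the .tags file name is arbitrary:
from musicnn.tagger import top_tags

top3 = top_tags('./audio/TRWJAZW128F42760DD_test.mp3',
                model='MTT_musicnn',
                topN=3,
                save_tags='TRWJAZW128F42760DD_test.tags')
print(top3)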
def analyse_list(parsedlist):
    track_tags_mtt = []
    track_compos_mtt = []
    track_tags_msd = []
    track_compos_msd = []

    for j, e in enumerate(parsedlist):
        fn = e[4]
        # top_tags_msd = top_tags(fn, model='MSD_musicnn', topN=10) ; top_tags_mtt = top_tags(fn, model='MTT_musicnn', topN=10)
        taggram_msd, tags_msd, feats_msd = extractor(fn,
                                                     model="MSD_musicnn",
                                                     extract_features=True)
        taggram_mtt, tags_mtt, feats_mtt = extractor(fn,
                                                     model="MTT_musicnn",
                                                     extract_features=True)

        # get genre tags and their associated likelihood
        means_mtt = taggram_mtt.mean(0)
        top_indexes_mtt = sorted(range(len(means_mtt)),
                                 key=lambda i: means_mtt[i],
                                 reverse=True)[:9]
        track_tags_mtt = [[tags_mtt[ix], '{:0.4f}'.format(means_mtt[ix])]
                          for ix in top_indexes_mtt]

        means_msd = taggram_msd.mean(0)
        top_indexes_msd = sorted(range(len(means_msd)),
                                 key=lambda i: means_msd[i],
                                 reverse=True)[:9]
        track_tags_msd = [[tags_msd[ix], '{:0.4f}'.format(means_msd[ix])]
                          for ix in top_indexes_msd]

        # get the markovian temporal composition (most likely tag per frame)
        track_compos_mtt = [[
            tags_mtt[np.argmax(taggram_mtt[i])],
            '{:0.4f}'.format(taggram_mtt[i][np.argmax(taggram_mtt[i])])
        ] for i in range(len(taggram_mtt))]
        track_compos_msd = [[
            tags_msd[np.argmax(taggram_msd[i])],
            '{:0.4f}'.format(taggram_msd[i][np.argmax(taggram_msd[i])])
        ] for i in range(len(taggram_msd))]

        # get other metadata
        audio = AudioSegment.from_file(fn)
        a_dur = audio.duration_seconds
        a_fr = audio.frame_rate
        a_fc = audio.frame_count()
        a_ch = audio.channels
        a_wid = audio.sample_width
        a_max = audio.max
        a_maxdb = audio.max_dBFS
        a_rms = audio.rms

        # pack the data
        track_data = {
            'duration': '{:0.4f}'.format(a_dur),
            'sample_rate': str(a_fr),
            'sample_count': '{:0.2f}'.format(a_fc),
            'channels': str(a_ch),
            'bit_resolution': str(8 * a_wid),
            'max_volume': '{:0.4f}'.format(a_maxdb),
            'mean_rms': '{:0.4f}'.format(a_rms),
            'tags_mtt': track_tags_mtt,
            'tags_msd': track_tags_msd,
            'comp_mtt': track_compos_mtt,
            'comp_msd': track_compos_msd
        }
        e.append(track_data)
        print("\n\n\t\t\t -----", j)

    return parsedlist
from musicnn.tagger import top_tags
tags = top_tags(file_name, model='MTT_musicnn', topN=3)

# -----------------------------

# ### Are you interested in the temporal evolution of these tags?
#
# Instead of predicting song-level tags, you can also plot the **Taggram**:

# In[3]:

from musicnn.extractor import extractor
taggram, tags = extractor(file_name, model='MTT_musicnn', extract_features=False)

# In[4]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt

# In[5]:

in_length = 3  # seconds; by default, the model takes inputs of 3 seconds with no overlap

plt.rcParams["figure.figsize"] = (10, 8)  # set size of the figures
fontsize = 12  # set figures font size
def make_frames(audio_file):
    # Extract musicnn features to drive the deep-dream visualisation
    taggram, tags, feature_map = extractor(audio_file,
                                           model=MUSICNN_MODEL,
                                           input_length=MUSICNN_INPUT_LENGTH)
    print(f'Musicnn features: {feature_map.keys()}')
    feature_map['taggram'] = taggram
    song_features = feature_map[FEATURE_NAME]

    # Load the pre-trained deep-dream graph
    graph = tf.Graph()
    sess = tf.InteractiveSession(graph=graph)
    with tf.gfile.FastGFile(DEEP_DREAM_MODEL, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # define input
    X = tf.placeholder(tf.float32, name="input")
    X2 = tf.expand_dims(X - IMAGENET_MEAN, 0)
    tf.import_graph_def(graph_def, {"input": X2})

    # Build one loss term per layer, weighted and summed into a single objective
    losses = []
    targets = []
    layers = []
    num_features = 0
    for layer_name in LAYER_NAMES:
        layer = graph.get_tensor_by_name("import/%s:0" % layer_name)
        layers.append(layer)
        num_features += int(layer.shape[-1])
        print(f'Layer {layer_name}, shape {layer.shape}')
        target = tf.placeholder(tf.float32, name="target")
        targets.append(target)
        # loss = tf.reduce_mean(tf.sqrt(tf.square(layer - target)))
        loss = tf.reduce_mean(layer * target)
        losses.append(loss)

    loss = losses[0] * LAYER_WEIGHTS[0]
    for i in range(1, len(losses)):
        loss = loss + losses[i] * LAYER_WEIGHTS[i]
    gradient = tf.gradients(loss, X)[0]

    def make_frame(image):
        frame = cv2.resize(image, (OUTPUT_IMAGE_SIZE, OUTPUT_IMAGE_SIZE),
                           interpolation=cv2.INTER_CUBIC) / 255
        frame = np.clip(frame, 0, 1)
        frame = np.power(frame, GAMMA) * 255
        return frame

    image = np.full((OCTAVE_PARAMS[0][0], OCTAVE_PARAMS[0][0], 3),
                    IMAGENET_MEAN,
                    dtype=np.float32)
    frame_num = 0
    for fi in range(len(song_features)):
        # Turn the musicnn feature vector into binary per-layer targets
        target_values = []
        features = song_features[fi]
        scale = int(num_features / len(features) * 4)
        scale = scale if scale % 2 else scale + 1
        features = cv2.resize(np.tile(features, (scale, 1)), (num_features, scale),
                              interpolation=cv2.INTER_LINEAR)[scale // 2]
        features = (features > FEATURE_THRESHOLD).astype(np.float32)
        print(f'Non-zero features {features.sum() / len(features) * 100:0.1f}%')
        mix_rng.shuffle(features)

        start = 0
        for l in range(len(layers)):
            layer = layers[l]
            target_size = int(layer.shape[3])
            t = features[start:start + target_size]
            start += target_size
            target_values.append(t)

        # Gradient-ascent octaves: each step nudges the image and writes a frame
        for oi in range(len(OCTAVE_PARAMS)):
            # l = sess.run(layer, {X: image})
            # print(f'size {image.shape} l shape {l.shape} l range {l.min()} {l.max()}')
            for batch in range(OCTAVE_PARAMS[oi][1]):
                args = {X: image}
                for t in range(len(targets)):
                    args[targets[t]] = target_values[t]
                g = sess.run(gradient, args)
                lr = OCTAVE_PARAMS[oi][2]
                image += lr * g / (np.abs(g).mean() + 1e-7)
                frame = make_frame(image)
                cv2.imwrite(os.path.join(TEMP_DIR, f'f-{frame_num:05d}.png'), frame)
                frame_num += 1
                cv2.imshow(f'image', frame / 255)
                cv2.waitKey(1)
            if oi < len(OCTAVE_PARAMS) - 1:
                image = cv2.resize(image,
                                   (OCTAVE_PARAMS[oi + 1][0], OCTAVE_PARAMS[oi + 1][0]),
                                   interpolation=cv2.INTER_CUBIC)

        # Walk back down the octaves, writing a few transition frames
        downscaled = image
        for oi in range(len(OCTAVE_PARAMS) - 2, -1, -1):
            s = OCTAVE_PARAMS[oi][0]
            downscaled = cv2.resize(downscaled, (s, s), interpolation=cv2.INTER_CUBIC)
            frame = make_frame(downscaled)
            cv2.imwrite(os.path.join(TEMP_DIR, f'f-{frame_num:05d}.png'), frame)
            frame_num += 1
            cv2.imshow(f'image', frame / 255)
            cv2.waitKey(100)

        global fps
        if fps is None:
            fps = int(np.round(frame_num / MUSICNN_INPUT_LENGTH))

        image = cv2.resize(image, (OCTAVE_PARAMS[0][0], OCTAVE_PARAMS[0][0]),
                           interpolation=cv2.INTER_CUBIC)
        image = (image - image.min()) / (image.max() - image.min()) * 255
import math
import os

import numpy as np
import pandas as pd
from musicnn.extractor import extractor
from musicnn.tagger import top_tags
from mutagen.mp3 import MP3

audio_names = os.listdir('music/')
audio_paths = ['music/' + i for i in audio_names]

# creating a data set with MusicNN features
features = []
for audio_path in audio_paths:
    print('Extracting features for ' + audio_path)

    # check the length of the audio
    audio_length = math.floor(MP3(audio_path).info.length)

    # extract features using MusicNN
    taggram, tags, _ = extractor(audio_path,
                                 model='MTT_musicnn',
                                 input_length=audio_length,
                                 extract_features=True)
    print(taggram.shape)
    features.append(taggram)

features = np.vstack(features)

# create and save a table
df = pd.DataFrame(features, columns=tags)
df['audio_path'] = audio_paths
df = df[['audio_path'] + tags]  # rearrange columns
df.to_csv('data/train_set.csv', index=False)