def extract_other_features(paths, path2gt, model_type):
    """Extracts MusiCNN or OpenL3 features and their corresponding ground_truth and identifiers (the path).

       OpenL3 features are extracted from non-overlapping audio patches of 1 second, 
       where each audio patch covers 128 mel bands.

       MusiCNN features are extracted from non-overlapping audio patches of 1 second, 
       where each audio patch covers 96 mel bands.

       We repeat ground_truth and identifiers to fit the number of extracted OpenL3 features.
    """

    if model_type == 'openl3':
        model = openl3.models.load_embedding_model(input_repr="mel128",
                                                   content_type="music",
                                                   embedding_size=512)

    first_audio = True
    for p in paths:
        if model_type == 'musicnn':
            taggram, tags, extracted_features = extractor(
                config['audio_folder'] + p,
                model='MSD_musicnn',
                extract_features=True,
                input_overlap=1)
            emb = extracted_features[
                'max_pool']  # or choose any other layer, for example: emb = taggram
            # Documentation: https://github.com/jordipons/musicnn/blob/master/DOCUMENTATION.md
        elif model_type == 'openl3':
            wave, sr = wavefile_to_waveform(config['audio_folder'] + p,
                                            'openl3')
            emb, _ = openl3.get_embedding(wave,
                                          sr,
                                          hop_size=1,
                                          model=model,
                                          verbose=False)

        if first_audio:
            features = emb
            ground_truth = np.repeat(path2gt[p], features.shape[0], axis=0)
            identifiers = np.repeat(p, features.shape[0], axis=0)
            first_audio = False
        else:
            features = np.concatenate((features, emb), axis=0)
            tmp_gt = np.repeat(path2gt[p], emb.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, emb.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    return [features, ground_truth, identifiers]
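# A minimal usage sketch (hypothetical file names and labels; it assumes the
# function above, config['audio_folder'], and its helper functions are in scope):
paths = ['blues/track_0001.wav', 'rock/track_0002.wav']          # hypothetical files
path2gt = {'blues/track_0001.wav': 0, 'rock/track_0002.wav': 1}  # path -> class id

features, ground_truth, identifiers = extract_other_features(
    paths, path2gt, model_type='musicnn')

# One row per extracted patch; labels and identifiers are repeated to match.
print(features.shape, ground_truth.shape, identifiers.shape)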
Example #2
def classify_audio(fname: str, model_path="../models/features_classifier.pkl"):
    # Load classifier
    clf = joblib.load(model_path)

    # Extract the `max_pool` features using musicnn
    _, _, features = extractor(fname,
                               model="MSD_musicnn",
                               input_overlap=1,
                               extract_features=True)

    maxpool_features = features["max_pool"]

    # Classify each feature with the trained classifier
    y_pred = clf.predict(maxpool_features)

    # Assign a string tag (e.g. 'classical' instead of 1) to each predicted class
    y_pred_labels = [idx2tag[n] for n in y_pred]

    # The assigned genre is the most common tag
    tag = Counter(y_pred_labels).most_common()[0][0]

    return tag
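# A hedged usage sketch: it assumes the pickled classifier at the default
# model_path and the idx2tag mapping used above both exist in this project.
predicted_genre = classify_audio('some_song.mp3')   # hypothetical input file
print('Predicted genre:', predicted_genre)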
Example #3

    def preprocess(self, musiccnn=False):
        debug_message = "preprocess" if not musiccnn else "musiccnn preprocess"
        print(f'{debug_message} start')
        feature_name = '/spec/' if not musiccnn else '/embed/'
        # Make the output directories (one per genre) for mel spectrograms or embeddings
        for genre in self.genres:
            os.makedirs(self.file_list_path + feature_name + genre,
                        exist_ok=True)
        path_out_list = []
        for path_in in tqdm(self.train_path_list + self.test_path_list):
            path_out = self.file_list_path + feature_name + path_in.replace(
                '.wav', '.npy')
            path_out_list.append(path_out)

            if os.path.isfile(path_out):
                print(f'{path_out} already exists')
                continue
            if musiccnn:
                _, _, embeds = extractor(
                    f'{self.file_list_path}/wav/{path_in}',
                    model='MSD_musicnn',
                    extract_features=True)
                embed = embeds['max_pool'].mean(axis=0)
                np.save(path_out, embed)
            else:
                signal, _ = librosa.load(
                    f'{self.file_list_path}/wav/{path_in}',
                    sr=self.sampling_rate)
                melspec = librosa.feature.melspectrogram(
                    y=signal,
                    sr=self.sampling_rate,
                    n_fft=self.number_fft,
                    hop_length=self.hop_length,
                    n_mels=self.number_mel)
                melspec = librosa.power_to_db(melspec)
                melspec = melspec.astype('float32')
                np.save(path_out, melspec)
        print(f'{debug_message} finish')
        return path_out_list
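# A hedged sketch of reading a cached file back (hypothetical paths that follow
# the file_list_path + feature_name + path_in convention used above):
embed = np.load('dataset/embed/blues/blues.00001.npy')    # averaged MusiCNN 'max_pool' vector
melspec = np.load('dataset/spec/blues/blues.00001.npy')   # log-mel spectrogram, float32
print(embed.shape, melspec.shape)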
Example #4
def extract_embeddings(in_base, out_base, model='MSD_musicnn'):
    # Make directories to save embeddings.
    for split in splits:
        out_dir = out_base + split
        os.makedirs(out_dir, exist_ok=True)

        for path_in in tqdm(glob(os.path.join(in_base, split, '*.wav'))):
            filename = os.path.basename(path_in)
            path_out = os.path.join(out_dir, filename.replace('.wav', '.npy'))

            # Skip if the embeddings file already exists
            if os.path.isfile(path_out):
                continue

            # Extract the embedding using the pre-trained model.
            _, _, embeds = extractor(path_in,
                                     model=model,
                                     extract_features=True)
            # Average the embeddings over temporal dimension.
            embed = embeds['max_pool'].mean(axis=0)

            # Save the embedding.
            np.save(path_out, embed)
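# A hedged usage sketch: `splits` is referenced above but not shown, so it is
# assumed to be a module-level list; the layout is assumed to be
# <in_base>/<split>/*.wav -> <out_base>/<split>/*.npy.
splits = ['train', 'valid', 'test']   # assumed split names
extract_embeddings('gtzan/wav/', 'gtzan/embeddings/', model='MSD_musicnn')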
Example #5
# 
# This notebook explains how to use the vgg models in `musicnn` as music feature extractors. `musicnn` allows you to extract features at every layer of the model. For this reason, we first present the model itself – so that you know what to expect out of each layer. To start, let's consider this music clip:

# In[1]:


file_name = './audio/TRWJAZW128F42760DD_test.mp3'


# Run these two lines of code to extract music features with our vgg model trained with the [MagnaTagATune](https://github.com/keunwoochoi/magnatagatune-list) dataset – the `MTT_vgg` model:

# In[2]:


from musicnn.extractor import extractor
taggram, tags, features = extractor(file_name, model='MTT_vgg', extract_features=True)


# Out of the extractor, we get the **output** of the model (the `taggram` and its associated `tags`) and all the **intermediate representations** of it (we refer to those as `features`). The `features` are packed in a dictionary:

# In[3]:


list(features.keys())


# Each of these keys corresponds to the output of a different layer of our vgg model. For this reason, it is important to understand the basic building blocks of this model, which we briefly outline in the following diagram:
# 
# <br>
# <img src="./images/vgg.png">
# <br>
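# A quick, optional check (not part of the original notebook): printing the shape
# of every entry in `features` shows how many frames and channels each layer
# produces for this clip.


for layer_name, layer_output in features.items():
    print(layer_name, layer_output.shape)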
Example #6
def plotOutDiagrams(songPath, modelUsed, subfolderName=False, show=False):
    # print("The Path is " + songPath)
    taggram, tags, somethingElse = extractor(songPath, model=modelUsed)

    if "SV.wav" == songPath and subfolderName:
        # songName = "SV_" + os.path.basename(os.path.dirname(songPath))
        # print("SubfolderName: " + subfolderName)
        # print("DirName: " + os.path.dirname(subfolderName))
        # print("BaseName: " + os.path.basename(subfolderName))
        songName = "SV_" + fixName(os.path.basename(subfolderName))
    else:
        songName = os.path.basename(songPath)[:-4]

    # print("The song name is " + songName)

    in_length = 3  # seconds; by default, the model takes 3-second inputs with no overlap

    plt.rcParams["figure.figsize"] = (10, 8)  # set size of the figures
    fontsize = 12  # set figures font size

    fig, ax = plt.subplots()

    # title
    ax.title.set_text('Taggram ' + songName + " " + modelUsed)
    ax.title.set_fontsize(fontsize)

    # x-axis title
    ax.set_xlabel('(seconds)', fontsize=fontsize)

    # y-axis
    y_pos = np.arange(len(tags))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(tags, fontsize=fontsize - 1)

    # x-axis
    x_pos = np.arange(taggram.shape[0])
    x_label = np.arange(in_length / 2, in_length * taggram.shape[0], 3)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_label, fontsize=fontsize)

    # depict taggram
    ax.imshow(taggram.T, interpolation=None, aspect="auto")
    if show:
        plt.show()

    plt.savefig("Taggram " + songName + " _ " + modelUsed + ".png")

    ####

    tags_likelihood_mean = np.mean(
        taggram, axis=0)  # averaging the Taggram through time

    fig, ax = plt.subplots()

    # title
    ax.title.set_text('Tags likelihood (mean of the taggram) ' + songName)
    ax.title.set_fontsize(fontsize)

    # y-axis title
    ax.set_ylabel('(likelihood)', fontsize=fontsize)

    # y-axis
    ax.set_ylim((0, 1))
    ax.tick_params(axis="y", labelsize=fontsize)

    # x-axis
    ax.tick_params(axis="x", labelsize=fontsize - 1)
    pos = np.arange(len(tags))
    ax.set_xticks(pos)
    ax.set_xticklabels(tags, rotation=90)

    # depict song-level tags likelihood
    ax.bar(pos, tags_likelihood_mean)

    if show:
        plt.show()
    # plt.show()
    plt.savefig("Tags likelihood " + songName + " " + modelUsed + ".png")
    plt.close('all')

    return taggram, tags
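# A hedged usage sketch (hypothetical path): this plots and saves both the
# taggram and the mean tag-likelihood bar chart for one file.
taggram, tags = plotOutDiagrams('audio/my_track.mp3', 'MTT_musicnn', show=False)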
Example #7
def showTags(taggram, tags):
    # (assumed signature and figure setup: the original snippet begins mid-function)
    fontsize = 12
    fig, ax = plt.subplots()

    ax.title.set_text('Taggram')
    ax.title.set_fontsize(fontsize)

    # x-axis title
    ax.set_xlabel('(seconds)', fontsize=fontsize)

    # y-axis
    y_pos = np.arange(len(tags))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(tags, fontsize=fontsize-2)

    # x-axis
    x_pos = np.arange(taggram.shape[0])
    x_label = np.arange(taggram.shape[0])
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_label, fontsize=4)

    plt.show()


print("Generating MSD tags...")
MSD_taggram, MSD_tags, MSD_features = extractor(input_path, model='MSD_musicnn', input_overlap=1, extract_features=True)
print("Generating MTT tags...")
MTT_taggram, MTT_tags, MTT_features = extractor(input_path, model='MTT_musicnn', input_overlap=1, extract_features=True)

taggram = np.concatenate((MSD_taggram, MTT_taggram), 1)
tags = np.concatenate((MSD_tags, MTT_tags))

showTags(taggram, tags)
exportJSON(taggram, tags, output_path)
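# exportJSON is not defined in this snippet; a minimal stand-in that matches the
# call above (it would need to be defined before that call) could serialize the
# per-frame likelihood of every tag like this:
import json

def exportJSON(taggram, tags, output_path):
    data = {tag: taggram[:, i].tolist() for i, tag in enumerate(tags)}
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)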
Example #8
        "Wrong usage. Correct example: `python3 recommend.py path_to_file.mp3`"
    )
    sys.exit(1)

# get the argument/path provided and check if it's .mp3
input_path = sys.argv[1]
if not input_path.endswith('.mp3'):
    print("Wrong path format. The script has only been tested on .mp3 files")
    sys.exit(2)

# get the length of the audio
input_length = math.floor(MP3(input_path).info.length)

# extract features using MusicNN
input_embed, _, _ = extractor(input_path,
                              model='MTT_musicnn',
                              input_length=input_length,
                              extract_features=True)

# load in the training set and select the embedding part
data = pd.read_csv('data/train_set.csv')
embeddings = data.iloc[:, 1:].values

# compute the similarities between input_embed and embeddings
cos_sim = []
for i in range(embeddings.shape[0]):
    cos_sim.append(
        cosine_similarity(input_embed.tolist()[0], embeddings[i, :].tolist()))

# sort cosine similarities, get top_n of them and their audio paths
top_n = 3
idx = np.array(cos_sim).argsort()[::-1][:top_n]
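# A hedged continuation sketch: map the top indices back to the audio paths in
# the first column of train_set.csv (assuming each cos_sim entry is a scalar).
for rank, i in enumerate(idx, start=1):
    print(f'{rank}. {data.iloc[i, 0]}')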
Example #9

def top_tags(file_name,
             model='MTT_musicnn',
             topN=3,
             input_length=3,
             input_overlap=False,
             print_tags=True,
             save_tags=False):
    ''' Predict the topN tags of the music-clip in file_name with the selected model.

    INPUT

    - file_name: path to the music file to tag.
    Data format: string.
    Example: './audio/TRWJAZW128F42760DD_test.mp3'
    
    - model: select a music audio tagging model.
    Data format: string.
    Options: 'MTT_musicnn', 'MTT_vgg', 'MSD_musicnn', 'MSD_musicnn_big' or 'MSD_vgg'.
    MTT models are trained with the MagnaTagATune dataset.
    MSD models are trained with the Million Song Dataset.
    To know more about these models, check our musicnn / vgg examples, and the FAQs.
    Important! 'MSD_musicnn_big' is only available if you install from source: python setup.py install.

    - topN: extract N most likely tags according to the selected model.
    Data format: integer.
    Example: 3
    
    - input_length: length (in seconds) of the input spectrogram patches. Set it small for real-time applications.
    Note: This is the length of the data that is going to be fed to the model. In other words, this parameter defines the temporal resolution of the taggram.
    Recommended value: 3, because the models were trained with 3 second inputs.
    Observation: the vgg models do not allow for different input lengths. For this reason, the vgg models' input_length needs to be set to 3. However, musicnn models allow for different input lengths: see this jupyter notebook.
    Data format: floating point number.
    Example: 3.1
    
    - input_overlap: amount of overlap (in seconds) of the input spectrogram patches.
    Note: Set it considering the input_length.
    Data format: floating point number.
    Example: 1.0
    
    - print_tags: set it True for printing the tags.
    Note: even if you don't print the tags, they are still returned by the musicnn.tagger.top_tags() function.
    Data format: boolean.
    Options: False (for NOT printing the tags), True (for printing the tags).
    
    - save_tags: Path where to store/save the tags.
    Data format: string.
    Example: 'file_name.tags'

    OUTPUT
    
    tags: topN most likely tags of the music-clip in file_name considering the selected model.
    Data format: list.
    Example: ['synth', 'techno']
    '''

    if 'vgg' in model and input_length != 3:
        raise ValueError(
            'Set input_length=3, the VGG models cannot handle different input lengths.'
        )

    taggram, tags = extractor(file_name,
                              model=model,
                              input_length=input_length,
                              input_overlap=input_overlap,
                              extract_features=False)
    tags_likelihood_mean = np.mean(taggram, axis=0)

    if print_tags:
        print('[' + file_name + '] Top' + str(topN) + ' tags: ')

    if save_tags:
        to = open(save_tags, 'a')
        to.write(file_name + ',' + model + ',input_length=' +
                 str(input_length) + ',input_overlap=' + str(input_overlap))

    topN_tags = []
    for tag_index in tags_likelihood_mean.argsort()[-topN:][::-1]:
        topN_tags.append(tags[tag_index])

        if print_tags:
            print(' - ' + tags[tag_index])

        if save_tags:
            to.write(',' + tags[tag_index])

    if save_tags:
        to.write('\n')
        to.close()

    return topN_tags
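# Example call: the audio path is the test clip used elsewhere on this page, and
# 'predictions.tags' is a hypothetical output file for the save_tags option.
tags = top_tags('./audio/TRWJAZW128F42760DD_test.mp3',
                model='MTT_musicnn',
                topN=5,
                save_tags='predictions.tags')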
Example #10
def analyse_list(parsedlist):
    track_tags_mtt = []
    track_compos_mtt = []
    track_tags_msd = []
    track_compos_msd = []
    for j, e in enumerate(parsedlist):
        fn = e[4]
        #top_tags_msd = top_tags(fn, model='MSD_musicnn', topN=10) ; top_tags_mtt = top_tags(fn, model='MTT_musicnn', topN=10)
        taggram_msd, tags_msd, feats_msd = extractor(fn,
                                                     model="MSD_musicnn",
                                                     extract_features=True)
        taggram_mtt, tags_mtt, feats_mtt = extractor(fn,
                                                     model="MTT_musicnn",
                                                     extract_features=True)
        # get genre tags and associated likelihood
        means_mtt = taggram_mtt.mean(0)
        top_indexes_mtt = sorted(range(len(means_mtt)),
                                 key=lambda i: means_mtt[i],
                                 reverse=True)[:9]
        track_tags_mtt = [[tags_mtt[ix], '{:0.4f}'.format(means_mtt[ix])]
                          for ix in top_indexes_mtt]
        means_msd = taggram_msd.mean(0)
        top_indexes_msd = sorted(range(len(means_msd)),
                                 key=lambda i: means_msd[i],
                                 reverse=True)[:9]
        track_tags_msd = [[tags_msd[ix], '{:0.4f}'.format(means_msd[ix])]
                          for ix in top_indexes_msd]
        # get markovian temporal composition
        track_compos_mtt = [[
            tags_mtt[np.argmax(taggram_mtt[i])],
            '{:0.4f}'.format(taggram_mtt[i][np.argmax(taggram_mtt[i])])
        ] for i in range(len(taggram_mtt))]
        track_compos_msd = [[
            tags_msd[np.argmax(taggram_msd[i])],
            '{:0.4f}'.format(taggram_msd[i][np.argmax(taggram_msd[i])])
        ] for i in range(len(taggram_msd))]
        # get other metadata
        audio = AudioSegment.from_file(fn)
        a_dur = audio.duration_seconds
        a_fr = audio.frame_rate
        a_fc = audio.frame_count()
        a_ch = audio.channels
        a_wid = audio.sample_width
        a_max = audio.max
        a_maxdb = audio.max_dBFS
        a_rms = audio.rms
        # pack the data
        track_data = {
            'duration': '{:0.4f}'.format(a_dur),
            'sample_rate': str(a_fr),
            'sample_count': '{:0.2f}'.format(a_fc),
            'channels': str(a_ch),
            'bit_resolution': str(8 * a_wid),
            'max_volume': '{:0.4f}'.format(a_maxdb),
            'mean_rms': '{:0.4f}'.format(a_rms),
            'tags_mtt': track_tags_mtt,
            'tags_msd': track_tags_msd,
            'comp_mtt': track_compos_mtt,
            'comp_msd': track_compos_msd
        }
        e.append(track_data)
        print("\n\n\t\t\t -----", j)
    return parsedlist
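# A hedged usage sketch: each entry of parsedlist is assumed to be a list whose
# fifth field (index 4) is the audio path; analyse_list appends a metadata dict
# to every entry in place.
playlist = [['id-001', 'Some Artist', 'Some Title', 'Some Album', 'audio/track01.mp3']]
analysed = analyse_list(playlist)
print(analysed[0][-1]['tags_mtt'][:3])   # the three most likely MagnaTagATune tags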
Example #11
from musicnn.tagger import top_tags

tags = top_tags(file_name, model='MTT_musicnn', topN=3)

# -----------------------------
# ### Are you interested in the temporal evolution of these tags?
#
# Instead of predicting song-level tags, you can also plot the **Taggram**:

# In[3]:

from musicnn.extractor import extractor

taggram, tags = extractor(file_name,
                          model='MTT_musicnn',
                          extract_features=False)

# In[4]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import matplotlib.pyplot as plt

# In[5]:

in_length = 3  # seconds; by default, the model takes 3-second inputs with no overlap

plt.rcParams["figure.figsize"] = (10, 8)  # set size of the figures
fontsize = 12  # set figures font size
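# A minimal sketch of the Taggram plot these settings prepare for, in the same
# style as the earlier examples on this page (not part of the original notebook):


fig, ax = plt.subplots()
ax.title.set_text('Taggram')
ax.title.set_fontsize(fontsize)
ax.set_xlabel('(seconds)', fontsize=fontsize)
ax.set_yticks(np.arange(len(tags)))
ax.set_yticklabels(tags, fontsize=fontsize - 1)
ax.imshow(taggram.T, interpolation=None, aspect='auto')
plt.show()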
Example #12
def make_frames(audio_file):
    taggram, tags, feature_map = extractor(audio_file,
                                           model=MUSICNN_MODEL,
                                           input_length=MUSICNN_INPUT_LENGTH)
    print(f'Musicnn features: {feature_map.keys()}')
    feature_map['taggram'] = taggram

    song_features = feature_map[FEATURE_NAME]

    graph = tf.Graph()
    sess = tf.InteractiveSession(graph=graph)

    with tf.gfile.FastGFile(DEEP_DREAM_MODEL, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # define input
    X = tf.placeholder(tf.float32, name="input")
    X2 = tf.expand_dims(X - IMAGENET_MEAN, 0)
    tf.import_graph_def(graph_def, {"input": X2})

    losses = []
    targets = []
    layers = []
    num_features = 0
    for layer_name in LAYER_NAMES:
        layer = graph.get_tensor_by_name("import/%s:0" % layer_name)
        layers.append(layer)
        num_features += int(layer.shape[-1])
        print(f'Layer {layer_name}, shape {layer.shape}')
        target = tf.placeholder(tf.float32, name="target")
        targets.append(target)
        # loss = tf.reduce_mean(tf.sqrt(tf.square(layer - target)))
        loss = tf.reduce_mean(layer * target)
        losses.append(loss)

    loss = losses[0] * LAYER_WEIGHTS[0]
    for i in range(1, len(losses)):
        loss = loss + losses[i] * LAYER_WEIGHTS[i]

    gradient = tf.gradients(loss, X)[0]

    def make_frame(image):
        frame = cv2.resize(image, (OUTPUT_IMAGE_SIZE, OUTPUT_IMAGE_SIZE),
                           interpolation=cv2.INTER_CUBIC) / 255
        frame = np.clip(frame, 0, 1)
        frame = np.power(frame, GAMMA) * 255
        return frame

    image = np.full((OCTAVE_PARAMS[0][0], OCTAVE_PARAMS[0][0], 3),
                    IMAGENET_MEAN,
                    dtype=np.float32)
    frame_num = 0
    for fi in range(len(song_features)):
        target_values = []
        features = song_features[fi]
        scale = int(num_features / len(features) * 4)
        scale = scale if scale % 2 else scale + 1
        features = cv2.resize(np.tile(features, (scale, 1)),
                              (num_features, scale),
                              interpolation=cv2.INTER_LINEAR)[scale // 2]
        features = (features > FEATURE_THRESHOLD).astype(np.float32)
        print(
            f'Non-zero features {features.sum() / len(features) * 100:0.1f}%')
        mix_rng.shuffle(features)
        start = 0
        for l in range(len(layers)):
            layer = layers[l]
            target_size = int(layer.shape[3])
            t = features[start:start + target_size]
            start += target_size
            target_values.append(t)

        for oi in range(len(OCTAVE_PARAMS)):
            # l = sess.run(layer, {X: image})
            # print(f'size {image.shape} l shape {l.shape} l range {l.min()} {l.max()}')

            for batch in range(OCTAVE_PARAMS[oi][1]):
                args = {X: image}
                for t in range(len(targets)):
                    args[targets[t]] = target_values[t]
                g = sess.run(gradient, args)
                lr = OCTAVE_PARAMS[oi][2]
                image += lr * g / (np.abs(g).mean() + 1e-7)
                frame = make_frame(image)
                cv2.imwrite(os.path.join(TEMP_DIR, f'f-{frame_num:05d}.png'),
                            frame)
                frame_num += 1
                cv2.imshow('image', frame / 255)
                cv2.waitKey(1)
            if oi < len(OCTAVE_PARAMS) - 1:
                image = cv2.resize(
                    image,
                    (OCTAVE_PARAMS[oi + 1][0], OCTAVE_PARAMS[oi + 1][0]),
                    interpolation=cv2.INTER_CUBIC)

        downscaled = image
        for oi in range(len(OCTAVE_PARAMS) - 2, -1, -1):
            s = OCTAVE_PARAMS[oi][0]
            downscaled = cv2.resize(downscaled, (s, s),
                                    interpolation=cv2.INTER_CUBIC)
            frame = make_frame(downscaled)
            cv2.imwrite(os.path.join(TEMP_DIR, f'f-{frame_num:05d}.png'),
                        frame)
            frame_num += 1
            cv2.imshow('image', frame / 255)
            cv2.waitKey(100)

        global fps
        if fps is None:
            fps = int(np.round(frame_num / MUSICNN_INPUT_LENGTH))

        image = cv2.resize(image, (OCTAVE_PARAMS[0][0], OCTAVE_PARAMS[0][0]),
                           interpolation=cv2.INTER_CUBIC)
        image = (image - image.min()) / (image.max() - image.min()) * 255
import os
import math

import numpy as np
import pandas as pd

from musicnn.extractor import extractor
from musicnn.tagger import top_tags
from mutagen.mp3 import MP3

audio_names = os.listdir('music/')
audio_paths = ['music/' + i for i in audio_names]

# creating a data set with MusicNN features
features = []
for audio_path in audio_paths:
    print('Extracting features for ' + audio_path)

    # check the length of the audio
    audio_length = math.floor(MP3(audio_path).info.length)

    # extract features using MusicNN
    taggram, tags, _ = extractor(audio_path,
                                 model='MTT_musicnn',
                                 input_length=audio_length,
                                 extract_features=True)
    print(taggram.shape)
    features.append(taggram)

features = np.vstack(features)

# create and save a table
df = pd.DataFrame(features, columns=tags)
df['audio_path'] = audio_paths
df = df[['audio_path'] + tags]  # rearrange columns
df.to_csv('data/train_set.csv', index=False)
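# A small optional sanity check: reload the saved table and inspect its shape
# (rows = tracks, columns = 'audio_path' plus one column per tag).
df_check = pd.read_csv('data/train_set.csv')
print(df_check.shape)
print(df_check.columns[:5].tolist())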