import numpy as np

import fma


def test(features, ndigits):
    """Check the extracted features against the reference CSV."""
    indices = features.index[features.isnull().any(axis=1)]
    if len(indices) > 0:
        print('Failed tracks: {}'.format(', '.join(str(i) for i in indices)))

    # Feature extraction should only fail for files that contain no audio.
    assert set(int(i) for i in indices) == set(fma.FILES_NO_AUDIO)

    # The features saved on disk should match the given ones up to the rounding precision.
    tmp = fma.load('data/features.csv')
    np.testing.assert_allclose(tmp.values, features.values, rtol=10**-ndigits)
#!/usr/bin/env python3
"""Adapted from https://github.com/mdeff/fma/blob/master/baselines.ipynb"""

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import fma

# The labels CSV has a single column; squeeze it into a Series.
y_train = pd.read_csv('data/train_labels.csv', index_col=0).squeeze('columns')
features = fma.load('data/features.csv')
# The first 25,000 tracks form the training set, the remainder the test set.
X_train = features[:25000]
X_test = features[25000:]

# Data cleaning

X_train = X_train.drop(fma.FAULTY_FILES)
y_train = y_train.drop(fma.FAULTY_FILES)

# Track IDs are integers for the training set, so cast the index to match y_train.
X_train.index = pd.Index((int(i) for i in X_train.index), name='track_id')

# This should already be the case, but better to be sure.
X_train.sort_index(inplace=True)
X_test.sort_index(inplace=True)
y_train.sort_index(inplace=True)

assert (X_train.index == y_train.index).all()
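
# What follows is a sketch of the baseline itself rather than code from the original
# script: shuffle the training data, standardize the features, fit the imported SVC,
# and write predictions for the test tracks. The output path 'data/predictions.csv'
# is an assumption, not part of the source.

# Shuffle to break any ordering by track ID before fitting.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Standardize features to zero mean and unit variance, reusing the training
# statistics for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A support vector classifier, matching the import above.
clf = SVC()
clf.fit(X_train_scaled, y_train)

# Predict a genre for every test track and save the result with its track ID.
predictions = pd.Series(clf.predict(X_test_scaled), index=X_test.index, name='genre')
predictions.to_csv('data/predictions.csv', header=True)
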
#!/usr/bin/env python3

import fma

tracks = fma.load('data/fma_metadata/tracks.csv')
# The 'subset' column is an ordered categorical (small < medium < large),
# so this keeps every track in the small and medium subsets.
subset = tracks.index[tracks['set', 'subset'] <= 'medium']
labels = tracks.loc[subset, ('track', 'genre_top')]
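
# Sanity check (a sketch, assuming every track in the small and medium subsets
# has a top-level genre assigned, as the FMA subsets are described):
assert labels.notnull().all(), 'Some selected tracks are missing a top genre.'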
labels.name = 'genre'  # Used as the column header in the CSV.
labels.to_csv('data/train_labels.csv', header=True)