def stratified_fold(data_dir, annotation_file, n_folds=10, savefile=''):
	"""Split the '.pitch' files found under data_dir into stratified folds.

	The mode of each file is taken from the name of the folder containing it,
	and the folds are stratified on that mode. The tonic of each file is looked
	up in the annotations by the file's name without extension (its "mbid").

	Parameters
	----------
	data_dir : str
		Directory handed to fileOperations.getModeNames and
		fileOperations.getFileNamesInDir.
	annotation_file : str
		Path of a JSON file holding a sequence of objects, each with at least
		the keys 'mbid' and 'tonic'.
	n_folds : int
		Number of folds requested from StratifiedKFold.
	savefile : str
		If non-empty, the folds are also written to this path as JSON.

	Returns
	-------
	dict
		Maps 'fold0', 'fold1', ... to {'train': [...], 'test': [...]}, where
		every entry is a dict with the keys 'file', 'mode', 'tonic' and 'mbid'.

	Raises
	------
	ValueError
		If some pitch file has no annotation with a matching 'mbid'.
	"""
	modes = fileOperations.getModeNames(data_dir)
	filepaths, basefolders, filenames = fileOperations.getFileNamesInDir(data_dir, extension='.pitch')

	with open(annotation_file, 'r') as annotation_handle:
		annotations = json.load(annotation_handle)

	filemodes = [os.path.basename(b) for b in basefolders]
	mbids = [os.path.splitext(f)[0] for f in filenames]

	# Index the annotations by mbid so that every file gets exactly one tonic.
	# Appending inside a nested search would leave the tonic list shorter
	# (missing annotation) or longer (duplicate annotation) than the file list
	# and silently pair files with the wrong tonic. The first annotation wins
	# when an mbid is annotated more than once.
	annotation_by_mbid = {}
	for annotation in annotations:
		annotation_by_mbid.setdefault(annotation['mbid'], annotation)

	missing = [m for m in mbids if m not in annotation_by_mbid]
	if missing:
		raise ValueError('No tonic annotation found for mbid(s): ' + ', '.join(missing))
	tonics = [annotation_by_mbid[m]['tonic'] for m in mbids]

	# get the stratified folds
	mode_idx = [modes.index(m) for m in filemodes]
	skf = cross_validation.StratifiedKFold(mode_idx, n_folds=n_folds, shuffle=True)

	def describe(idx):
		# One record per file, in the layout stored in the folds.
		return {'file': filepaths[idx], 'mode': filemodes[idx],
		        'tonic': tonics[idx], 'mbid': mbids[idx]}

	folds = dict()
	for ff, (train_indices, test_indices) in enumerate(skf):
		folds['fold' + str(ff)] = {'train': [describe(i) for i in train_indices],
		                           'test': [describe(i) for i in test_indices]}

	# save the folds to a file if specified
	if savefile:
		with open(savefile, 'w') as f:
			json.dump(folds, f, indent=2)

	return folds
# Example no. 2
# 0
from extras import foldGeneration
from extras import fileOperations as fo
import numpy as np
from sklearn import cross_validation

import os  # the path handling below needs it; it is not among the imports above

# I/O
base_dir = '../../experiments/raag-recognition/'
data_dir = os.path.join(base_dir, 'data')
experiments_dir = os.path.join(base_dir, 'experiments')
modes = fo.getModeNames(data_dir)

n_exp = 20
n_folds = 12

# get the data into appropriate format
[pitch_paths, pitch_base, pitch_fname] = fo.getFileNamesInDir(data_dir, '.pitch')
tonic_paths = [os.path.splitext(p)[0] + '.tonic' for p in pitch_paths]

# Assign exactly one mode label per pitch file. The labels are zipped with the
# file lists below, so a folder matching no mode or several modes must not
# shift the labels of the files that follow it.
mode_labels = []
for p in pitch_base:
    folder = os.path.basename(p)
    if folder in modes:
        # the folder is named after the mode itself
        mode_labels.append(folder)
        continue
    # otherwise fall back to a mode name occurring somewhere in the folder path
    candidates = [r for r in modes if r in p]
    if len(candidates) != 1:
        raise ValueError('Cannot determine a unique mode for folder %r '
                         '(matching modes: %r)' % (p, candidates))
    mode_labels.append(candidates[0])

# make the data a single dictionary for housekeeping
# (each '.tonic' file is read as a single number)
data = [{'file': p, 'name': os.path.splitext(f)[0],
         'tonic': float(np.loadtxt(t)), 'mode': r}
        for p, f, t, r in zip(pitch_paths, pitch_fname, tonic_paths, mode_labels)]

# create 20 stratified 12 fold
mode_idx = [modes.index(d['mode']) for d in data]