def stratified_fold(data_dir, annotation_file, n_folds=10, savefile=''):
    """Split the '.pitch' files found under ``data_dir`` into stratified folds.

    The mode of each file is the name of the folder that contains it and
    the mbid is the file name without its extension. The tonic of each
    file is read from the annotation entry that carries the same mbid.

    Parameters
    ----------
    data_dir : str
        Directory searched for '.pitch' files; also passed to
        ``fileOperations.getModeNames`` to obtain the mode names.
    annotation_file : str
        Path of a JSON file holding an iterable of entries, each with
        (at least) the keys 'mbid' and 'tonic'.
    n_folds : int
        Number of folds handed to ``StratifiedKFold``.
    savefile : str
        If non-empty, the folds are additionally dumped to this path as
        JSON.

    Returns
    -------
    dict
        ``{'fold0': {'train': [...], 'test': [...]}, 'fold1': ...}`` where
        every list item is a dict with the keys 'file', 'mode', 'tonic'
        and 'mbid'.

    Raises
    ------
    ValueError
        If a '.pitch' file has no annotation with a matching mbid.
    """
    modes = fileOperations.getModeNames(data_dir)
    filepaths, basefolders, filenames = fileOperations.getFileNamesInDir(
        data_dir, extension='.pitch')

    with open(annotation_file, 'r') as annotation_handle:
        annotations = json.load(annotation_handle)

    filemodes = [os.path.basename(b) for b in basefolders]
    mbids = [os.path.splitext(f)[0] for f in filenames]

    # Index the tonics by mbid so that every file gets exactly one tonic.
    # Appending once per matching annotation would shift the tonic list
    # against the file list as soon as an mbid is annotated zero or
    # several times. When an mbid is annotated more than once the first
    # annotation is kept.
    tonic_by_mbid = {}
    for annotation in annotations:
        tonic_by_mbid.setdefault(annotation['mbid'], annotation['tonic'])

    unannotated = [m for m in mbids if m not in tonic_by_mbid]
    if unannotated:
        raise ValueError(
            'No tonic annotation in {0} for mbid(s): {1}'.format(
                annotation_file, ', '.join(unannotated)))
    tonics = [tonic_by_mbid[m] for m in mbids]

    # get the stratified folds
    # NOTE(review): assumes getModeNames returns a list (``.index`` is
    # used) that contains every folder name -- confirm.
    mode_idx = [modes.index(m) for m in filemodes]
    skf = cross_validation.StratifiedKFold(mode_idx, n_folds=n_folds,
                                           shuffle=True)

    def _entry(idx):
        # description of a single recording as stored in a fold
        return {'file': filepaths[idx], 'mode': filemodes[idx],
                'tonic': tonics[idx], 'mbid': mbids[idx]}

    folds = dict()
    for ff, (train_idx, test_idx) in enumerate(skf):
        folds['fold' + str(ff)] = {
            'train': [_entry(tr_idx) for tr_idx in train_idx],
            'test': [_entry(te_idx) for te_idx in test_idx],
        }

    # save the folds to a file if specified
    if savefile:
        with open(savefile, 'w') as f:
            json.dump(folds, f, indent=2)

    return folds
import os

import numpy as np
from sklearn import cross_validation

from extras import foldGeneration
from extras import fileOperations as fo

# I/O
base_dir = '../../experiments/raag-recognition/'
data_dir = os.path.join(base_dir, 'data')
experiments_dir = os.path.join(base_dir, 'experiments')

modes = fo.getModeNames(data_dir)
n_exp = 20
n_folds = 12

# get the data into appropriate format
[pitch_paths, pitch_base, pitch_fname] = fo.getFileNamesInDir(data_dir,
                                                              '.pitch')
# the tonic of each recording is stored next to its pitch file
tonic_paths = [os.path.splitext(p)[0] + '.tonic' for p in pitch_paths]

# Resolve exactly one mode label per pitch file. A plain "append every mode
# whose name occurs in the folder path" can yield zero or several labels for
# one file, which shifts mode_labels against pitch_paths and makes the zip()
# below pair files with the wrong mode (or silently drop files).
mode_labels = []
for p in pitch_base:
    candidates = [r for r in modes if r in p]
    if len(candidates) > 1:
        # several mode names occur in the path (e.g. one name is contained
        # in another): prefer the one that equals the folder name itself
        folder_name = os.path.basename(os.path.normpath(p))
        exact = [r for r in candidates if r == folder_name]
        if exact:
            candidates = exact[:1]
    if len(candidates) != 1:
        raise ValueError(
            'Cannot determine a unique mode for folder {0!r}; '
            'matching modes: {1!r}'.format(p, candidates))
    mode_labels.append(candidates[0])

# make the data a single dictionary for housekeeping
data = []
for p, f, t, r in zip(pitch_paths, pitch_fname, tonic_paths, mode_labels):
    data.append({'file': p,
                 'name': os.path.splitext(f)[0],
                 # each .tonic file is read as a single number
                 'tonic': float(np.loadtxt(t)),
                 'mode': r})

# create 20 stratified 12 fold
# index of each recording's mode in `modes`
mode_idx = [modes.index(d['mode']) for d in data]