def get_best_polyrbf_params(aizko_svm, trainfeatsf, kernel, cgrid, paramgrid,
                            workdir, expname, ntimes=3, stratified=False,
                            rocarea_opt=False, svmargs=''):
    """Grid search for the best C and kernel parameter (poly degree / RBF gamma).

    Runs `ntimes` rounds of 2-fold cross-validation over every (C, param)
    combination in `cgrid` x `paramgrid`, scoring each run with the value at
    `rate_idx` of the results returned by svm_polyrbf_test, and keeps the
    best-scoring pair.

    Parameters
    ----------
    aizko_svm   : SVM tool handle/path passed through to svm_polyrbf_test.
    trainfeatsf : str, path to the training features file (svmperf format);
                  a leading line starting with '#' is treated as a header.
    kernel      : int, 1 for polynomial, 2 for RBF.
    cgrid       : sequence of C values to try.
    paramgrid   : sequence of kernel parameter values (degree or gamma).
    workdir     : str, directory for intermediate experiment files.
    expname     : str, experiment name prefix for intermediate files.
    ntimes      : int, number of cross-validation repetitions.
    stratified  : bool, build class-stratified partitions when True.
    rocarea_opt : bool, optimize ROC area instead of error rate.
    svmargs     : str, extra arguments forwarded to the SVM tool.

    Returns
    -------
    (bestc, bestp) : best C value and best kernel parameter found.

    Raises
    ------
    Re-raises the last svm_polyrbf_test error after 10 failed retries.
    """
    bestc = cgrid[0]
    bestp = paramgrid[0]
    nfolds = 2
    rate = 0
    rate_idx = 8  # brier-score

    if kernel == 1:
        suffix = '.poly'
        param = 'd'
    elif kernel == 2:
        suffix = '.rbf'
        param = 'g'

    if rocarea_opt:
        suffix += '.rocarea'
    else:
        suffix += '.errorrate'
    suffix += '.gridsearch'

    redoing = False

    # Read the data once; drop the leading '#' comment header if present.
    with open(trainfeatsf) as featsfile:
        data = featsfile.readlines()
    nlines = len(data)
    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]
    data = np.array(data)

    # BUGFIX: the stratified branch below used undefined names `classes` and
    # `testlabels` (NameError when stratified=True); derive them from the
    # features file exactly as get_best_c_param does.
    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)

    for _ in np.arange(ntimes):
        # create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            partition = np.empty([nlines, nfolds], dtype=bool)
            for clab in classes:
                gsiz = np.sum(testlabels == clab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == clab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0]
        [trainf, testf] = twofold_file_split(basefname, data, partition)

        for cval in cgrid:
            for pval in paramgrid:
                fail_count = 0
                done = False
                while not done:
                    texpname = (expname + '_c' + str(cval) + '_' + param +
                                str(pval) + suffix)
                    try:
                        results = svm_polyrbf_test(aizko_svm, trainf, testf,
                                                   texpname, workdir, cval,
                                                   pval, redoing, rocarea_opt,
                                                   svmargs)
                        done = True
                    except Exception:  # narrowed from bare except
                        log.debug('Failed. Repeating...')
                        # Re-shuffle the partition and retry, up to 10 times.
                        partition = cvpartition(nlines, 2)
                        basefname = os.path.splitext(trainfeatsf)[0]
                        [trainf, testf] = twofold_file_split(basefname, data,
                                                             partition)
                        fail_count += 1
                        if fail_count >= 10:
                            log.error('Unexpected error: ' +
                                      str(sys.exc_info()))
                            log.debug('Failed too many times.')
                            raise
                if done:
                    log.debug(results)
                    new_rate = results[rate_idx]
                    if rate < new_rate:
                        rate = new_rate
                        bestc = cval
                        bestp = pval

    # Clean up intermediate grid-search files from the work directory.
    remove_all(find(os.listdir(workdir), 'gridsearch'), workdir)

    return bestc, bestp
def get_best_c_param(aizko_svm, trainfeatsf, cgrid, workdir, expname,
                     ntimes=3, stratified=False, rocarea_opt=False,
                     svmargs=''):
    """Grid search for the best C value of a linear SVM.

    Runs `ntimes` rounds of 2-fold cross-validation over every C in `cgrid`,
    scoring each run with the value at `rate_idx` of the results returned by
    svm_linear_test, and keeps the best-scoring C.

    Parameters
    ----------
    aizko_svm   : SVM tool handle/path passed through to svm_linear_test.
    trainfeatsf : str, path to the training features file (svmperf format);
                  a leading line starting with '#' is treated as a header.
    cgrid       : sequence of C values to try.
    workdir     : str, directory for intermediate experiment files.
    expname     : str, experiment name prefix for intermediate files.
    ntimes      : int, number of cross-validation repetitions.
    stratified  : bool, build class-stratified partitions when True.
    rocarea_opt : bool, optimize ROC area instead of error rate.
    svmargs     : str, extra arguments forwarded to the SVM tool.

    Returns
    -------
    bestc : best C value found.

    Raises
    ------
    Re-raises the last svm_linear_test error after 10 failed retries.
    """
    bestc = cgrid[0]
    rate = 0
    nfolds = 2
    # rate_idx = 8  # brier
    rate_idx = 0  # accuracy
    log.debug('Grid search optimization index: ' + str(rate_idx))

    if rocarea_opt:
        suffix = '.linear.rocarea.gridsearch'
    else:
        suffix = '.linear.errorrate.gridsearch'

    redoing = False

    # Read the data once; drop the leading '#' comment header if present.
    with open(trainfeatsf) as featsfile:
        data = featsfile.readlines()
    nlines = len(data)
    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]
    data = np.array(data)

    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)

    for _ in np.arange(ntimes):
        # create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            partition = np.empty([nlines, nfolds], dtype=int)
            for clab in classes:
                gsiz = np.sum(testlabels == clab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == clab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0] + '.' + expname
        [trainf, testf] = twofold_file_split(basefname, data, partition)

        # evaluate best parameter
        for cval in cgrid:
            fail_count = 0
            done = False
            texpname = expname + '_c' + str(cval) + suffix
            while not done:
                try:
                    results = svm_linear_test(aizko_svm, trainf, testf,
                                              texpname, workdir, cval,
                                              redoing, rocarea_opt, svmargs)
                    done = True
                except Exception:  # narrowed from bare except
                    log.error('Unexpected error: ' + str(sys.exc_info()))
                    log.debug('Failed. Repeating...')
                    # NOTE(review): the retry basefname drops the
                    # '.' + expname suffix used above — preserved as-is;
                    # confirm whether that difference is intended.
                    partition = cvpartition(nlines, 2)
                    basefname = os.path.splitext(trainfeatsf)[0]
                    [trainf, testf] = twofold_file_split(basefname, data,
                                                         partition)
                    fail_count += 1
                    if fail_count >= 10:
                        log.error('Unexpected error: ' + str(sys.exc_info()))
                        log.debug('Failed too many times.')
                        raise
            if done:
                log.debug(results)
                new_rate = results[rate_idx]
                if rate < new_rate:
                    rate = new_rate
                    bestc = cval

    # Clean up intermediate grid-search files from the work directory.
    remove_all(find(os.listdir(workdir), suffix), workdir)

    return bestc
def main(argv=None):
    """Create per-fold cross-validation partition files.

    Writes one text file per fold containing, for every subject in the
    subjects list, a 0 (training set) or 1 (test set), preceded by a small
    '#'-commented header, plus an 'all.txt' with '<label>,<subject>' lines.

    Returns
    -------
    0 on success.
    """
    parser = argparse.ArgumentParser(
        description='Creates text files with the same number of lines as the subjs file with 0s and 1s indicating which ones go to the training set (0) or test set(1)')
    parser.add_argument('-c', '--classes', dest='classes', required=True,
                        help='class label file. one line per class: <class_label>,<class_name>.')
    parser.add_argument('-s', '--subjs', dest='subjs', required=True,
                        help='list file with the subjects for the analysis. Each line: <class_label>,<subject_file>')
    parser.add_argument('-k', '--folds', dest='folds', type=int, default=10,
                        required=False,
                        help='Number of folds to separate the data. Set to 0 if you want a leave-one-out.')
    parser.add_argument('-o', '--out', dest='outdir', required=True,
                        help='name of the output directory where the results will be put.')
    parser.add_argument('-b', '--balanced', dest='balanced', default='1',
                        choices=['1', '0'], required=False,
                        help='If 1 it will separate proportional number of subjects for each class, else it will randomly pick any subject from the list (default: 1)')

    args = parser.parse_args()

    classf = args.classes.strip()
    subjsf = args.subjs.strip()
    outdir = args.outdir.strip()
    folds = args.folds
    balanced = args.balanced.strip()

    # reading label file: one '<class_label>,<class_name>' per line
    labels = []
    classnames = []
    with open(classf, 'r') as labfile:
        for l in labfile:
            line = l.strip().split(',')
            labels.append(int(line[0]))
            classnames.append(line[1])

    labels = np.array(labels)
    classnames = np.array(classnames)

    # reading subjects list: one '<class_label>,<subject_file>' per line;
    # subjlabidx stores the index of each subject's label within `labels`
    subjlabidx = []
    subjs = []
    with open(subjsf, 'r') as subjfile:
        for s in subjfile:
            line = s.strip().split(',')
            lab = int(line[0])
            idx = np.where(labels == lab)[0]
            subjlabidx.append(idx[0])
            subjs.append(line[1])

    # transforming from list to vector
    subjlabidx = np.array(subjlabidx)
    subjs = np.array(subjs)

    classnum = labels.size
    subjsnum = subjlabidx.size

    # if output dir does not exist, create
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # generating partitions
    # BUGFIX: `balanced` is the string '1' or '0' and both are truthy, so the
    # original `if balanced:` always took the balanced branch; compare to '1'.
    if balanced == '1':
        # cvparts is filled class by class so every fold receives a
        # proportional number of subjects from each class
        cvparts = np.empty([subjsnum, folds], dtype=int)
        for i in range(classnum):
            gsiz = sum(subjlabidx == i)
            gcvpart = cvpartition(gsiz, folds)
            for f in range(folds):
                cvparts[subjlabidx == i, f] = gcvpart[:, f]
    else:
        cvparts = cvpartition(subjsnum, folds)

    # generating files
    np.savetxt(outdir + '/all.txt',
               np.column_stack([labels[subjlabidx], subjs]), fmt='%s,%s')

    for i in range(folds):
        part = cvparts[:, i]
        fname = outdir + '/fold_' + str(i + 1).zfill(4) + '.txt'
        with open(fname, 'w') as f:
            f.write('#subjects file name: ' + subjsf)
            f.write('\n')
            f.write('#number of subjects: ' + str(len(part)))
            f.write('\n')
            f.write('#fold number: ' + str(i + 1))
            f.write('\n')
            f.write('#training set size: ' + str(sum(part == 0)))
            f.write('\n')
            f.write('#training set label: 0')
            f.write('\n')
            f.write('#test set size: ' + str(sum(part == 1)))
            f.write('\n')
            f.write('#test set label: 1')
            f.write('\n')
            np.savetxt(f, part, fmt='%i')

    return 0
def get_best_polyrbf_params(aizko_svm, trainfeatsf, kernel, cgrid, paramgrid,
                            workdir, expname, ntimes=3, stratified=False,
                            rocarea_opt=False, svmargs=''):
    """Grid search for the best C and kernel parameter (poly degree / RBF gamma).

    Runs `ntimes` rounds of 2-fold cross-validation over every (C, param)
    combination in `cgrid` x `paramgrid`, scoring each run with the value at
    `rate_idx` of the results returned by svm_polyrbf_test, and keeps the
    best-scoring pair.

    Parameters
    ----------
    aizko_svm   : SVM tool handle/path passed through to svm_polyrbf_test.
    trainfeatsf : str, path to the training features file (svmperf format);
                  a leading line starting with '#' is treated as a header.
    kernel      : int, 1 for polynomial, 2 for RBF.
    cgrid       : sequence of C values to try.
    paramgrid   : sequence of kernel parameter values (degree or gamma).
    workdir     : str, directory for intermediate experiment files.
    expname     : str, experiment name prefix for intermediate files.
    ntimes      : int, number of cross-validation repetitions.
    stratified  : bool, build class-stratified partitions when True.
    rocarea_opt : bool, optimize ROC area instead of error rate.
    svmargs     : str, extra arguments forwarded to the SVM tool.

    Returns
    -------
    (bestc, bestp) : best C value and best kernel parameter found.

    Raises
    ------
    Re-raises the last svm_polyrbf_test error after 10 failed retries.
    """
    bestc = cgrid[0]
    bestp = paramgrid[0]
    nfolds = 2
    rate = 0
    rate_idx = 8  # brier-score

    if kernel == 1:
        suffix = '.poly'
        param = 'd'
    elif kernel == 2:
        suffix = '.rbf'
        param = 'g'

    if rocarea_opt:
        suffix += '.rocarea'
    else:
        suffix += '.errorrate'
    suffix += '.gridsearch'

    redoing = False

    # Read the data once; drop the leading '#' comment header if present.
    with open(trainfeatsf) as featsfile:
        data = featsfile.readlines()
    nlines = len(data)
    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]
    data = np.array(data)

    # BUGFIX: the stratified branch below used undefined names `classes` and
    # `testlabels` (NameError when stratified=True); derive them from the
    # features file exactly as get_best_c_param does.
    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)

    for _ in np.arange(ntimes):
        # create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            partition = np.empty([nlines, nfolds], dtype=bool)
            for clab in classes:
                gsiz = np.sum(testlabels == clab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == clab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0]
        [trainf, testf] = twofold_file_split(basefname, data, partition)

        for cval in cgrid:
            for pval in paramgrid:
                fail_count = 0
                done = False
                while not done:
                    texpname = (expname + '_c' + str(cval) + '_' + param +
                                str(pval) + suffix)
                    try:
                        results = svm_polyrbf_test(aizko_svm, trainf, testf,
                                                   texpname, workdir, cval,
                                                   pval, redoing, rocarea_opt,
                                                   svmargs)
                        done = True
                    except Exception:  # narrowed from bare except
                        log.debug('Failed. Repeating...')
                        # Re-shuffle the partition and retry, up to 10 times.
                        partition = cvpartition(nlines, 2)
                        basefname = os.path.splitext(trainfeatsf)[0]
                        [trainf, testf] = twofold_file_split(basefname, data,
                                                             partition)
                        fail_count += 1
                        if fail_count >= 10:
                            log.error('Unexpected error: ' +
                                      str(sys.exc_info()))
                            log.debug('Failed too many times.')
                            raise
                if done:
                    log.debug(results)
                    new_rate = results[rate_idx]
                    if rate < new_rate:
                        rate = new_rate
                        bestc = cval
                        bestp = pval

    # Clean up intermediate grid-search files from the work directory.
    remove_all(find(os.listdir(workdir), 'gridsearch'), workdir)

    return bestc, bestp
def get_best_c_param(aizko_svm, trainfeatsf, cgrid, workdir, expname,
                     ntimes=3, stratified=False, rocarea_opt=False,
                     svmargs=''):
    """Grid search for the best C value of a linear SVM.

    Runs `ntimes` rounds of 2-fold cross-validation over every C in `cgrid`,
    scoring each run with the value at `rate_idx` of the results returned by
    svm_linear_test, and keeps the best-scoring C.

    Parameters
    ----------
    aizko_svm   : SVM tool handle/path passed through to svm_linear_test.
    trainfeatsf : str, path to the training features file (svmperf format);
                  a leading line starting with '#' is treated as a header.
    cgrid       : sequence of C values to try.
    workdir     : str, directory for intermediate experiment files.
    expname     : str, experiment name prefix for intermediate files.
    ntimes      : int, number of cross-validation repetitions.
    stratified  : bool, build class-stratified partitions when True.
    rocarea_opt : bool, optimize ROC area instead of error rate.
    svmargs     : str, extra arguments forwarded to the SVM tool.

    Returns
    -------
    bestc : best C value found.

    Raises
    ------
    Re-raises the last svm_linear_test error after 10 failed retries.
    """
    bestc = cgrid[0]
    rate = 0
    nfolds = 2
    # rate_idx = 8  # brier
    rate_idx = 0  # accuracy
    log.debug('Grid search optimization index: ' + str(rate_idx))

    if rocarea_opt:
        suffix = '.linear.rocarea.gridsearch'
    else:
        suffix = '.linear.errorrate.gridsearch'

    redoing = False

    # Read the data once; drop the leading '#' comment header if present.
    with open(trainfeatsf) as featsfile:
        data = featsfile.readlines()
    nlines = len(data)
    if data[0][0] == '#':
        nlines -= 1
        data = data[1:]
    data = np.array(data)

    testlabels = read_labels_from_svmperf_file(trainfeatsf)
    classes = np.unique(testlabels)

    for _ in np.arange(ntimes):
        # create partitions
        if not stratified:
            partition = cvpartition(nlines, nfolds)
        else:
            partition = np.empty([nlines, nfolds], dtype=int)
            for clab in classes:
                gsiz = np.sum(testlabels == clab)
                gcvpart = cvpartition(gsiz, nfolds)
                for fold in np.arange(nfolds):
                    partition[testlabels == clab, fold] = gcvpart[:, fold]
            partition = np.bool_(partition)

        basefname = os.path.splitext(trainfeatsf)[0] + '.' + expname
        [trainf, testf] = twofold_file_split(basefname, data, partition)

        # evaluate best parameter
        for cval in cgrid:
            fail_count = 0
            done = False
            texpname = expname + '_c' + str(cval) + suffix
            while not done:
                try:
                    results = svm_linear_test(aizko_svm, trainf, testf,
                                              texpname, workdir, cval,
                                              redoing, rocarea_opt, svmargs)
                    done = True
                except Exception:  # narrowed from bare except
                    log.error('Unexpected error: ' + str(sys.exc_info()))
                    log.debug('Failed. Repeating...')
                    # NOTE(review): the retry basefname drops the
                    # '.' + expname suffix used above — preserved as-is;
                    # confirm whether that difference is intended.
                    partition = cvpartition(nlines, 2)
                    basefname = os.path.splitext(trainfeatsf)[0]
                    [trainf, testf] = twofold_file_split(basefname, data,
                                                         partition)
                    fail_count += 1
                    if fail_count >= 10:
                        log.error('Unexpected error: ' + str(sys.exc_info()))
                        log.debug('Failed too many times.')
                        raise
            if done:
                log.debug(results)
                new_rate = results[rate_idx]
                if rate < new_rate:
                    rate = new_rate
                    bestc = cval

    # Clean up intermediate grid-search files from the work directory.
    remove_all(find(os.listdir(workdir), suffix), workdir)

    return bestc
def main(argv=None):
    """Create per-fold cross-validation partition files.

    Writes one text file per fold containing, for every subject in the
    subjects list, a 0 (training set) or 1 (test set), preceded by a small
    '#'-commented header, plus an 'all.txt' with '<label>,<subject>' lines.

    Returns
    -------
    0 on success.
    """
    parser = argparse.ArgumentParser(
        description='Creates text files with the same number of lines as the subjs file with 0s and 1s indicating which ones go to the training set (0) or test set(1)')
    parser.add_argument('-c', '--classes', dest='classes', required=True,
                        help='class label file. one line per class: <class_label>,<class_name>.')
    parser.add_argument('-s', '--subjs', dest='subjs', required=True,
                        help='list file with the subjects for the analysis. Each line: <class_label>,<subject_file>')
    parser.add_argument('-k', '--folds', dest='folds', type=int, default=10,
                        required=False,
                        help='Number of folds to separate the data. Set to 0 if you want a leave-one-out.')
    parser.add_argument('-o', '--out', dest='outdir', required=True,
                        help='name of the output directory where the results will be put.')
    parser.add_argument('-b', '--balanced', dest='balanced', default='1',
                        choices=['1', '0'], required=False,
                        help='If 1 it will separate proportional number of subjects for each class, else it will randomly pick any subject from the list (default: 1)')

    args = parser.parse_args()

    classf = args.classes.strip()
    subjsf = args.subjs.strip()
    outdir = args.outdir.strip()
    folds = args.folds
    balanced = args.balanced.strip()

    # reading label file: one '<class_label>,<class_name>' per line
    labels = []
    classnames = []
    with open(classf, 'r') as labfile:
        for l in labfile:
            line = l.strip().split(',')
            labels.append(int(line[0]))
            classnames.append(line[1])

    labels = np.array(labels)
    classnames = np.array(classnames)

    # reading subjects list: one '<class_label>,<subject_file>' per line;
    # subjlabidx stores the index of each subject's label within `labels`
    subjlabidx = []
    subjs = []
    with open(subjsf, 'r') as subjfile:
        for s in subjfile:
            line = s.strip().split(',')
            lab = int(line[0])
            idx = np.where(labels == lab)[0]
            subjlabidx.append(idx[0])
            subjs.append(line[1])

    # transforming from list to vector
    subjlabidx = np.array(subjlabidx)
    subjs = np.array(subjs)

    classnum = labels.size
    subjsnum = subjlabidx.size

    # if output dir does not exist, create
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # generating partitions
    # BUGFIX: `balanced` is the string '1' or '0' and both are truthy, so the
    # original `if balanced:` always took the balanced branch; compare to '1'.
    if balanced == '1':
        # cvparts is filled class by class so every fold receives a
        # proportional number of subjects from each class
        cvparts = np.empty([subjsnum, folds], dtype=int)
        for i in range(classnum):
            gsiz = sum(subjlabidx == i)
            gcvpart = cvpartition(gsiz, folds)
            for f in range(folds):
                cvparts[subjlabidx == i, f] = gcvpart[:, f]
    else:
        cvparts = cvpartition(subjsnum, folds)

    # generating files
    np.savetxt(outdir + '/all.txt',
               np.column_stack([labels[subjlabidx], subjs]), fmt='%s,%s')

    for i in range(folds):
        part = cvparts[:, i]
        fname = outdir + '/fold_' + str(i + 1).zfill(4) + '.txt'
        with open(fname, 'w') as f:
            f.write('#subjects file name: ' + subjsf)
            f.write('\n')
            f.write('#number of subjects: ' + str(len(part)))
            f.write('\n')
            f.write('#fold number: ' + str(i + 1))
            f.write('\n')
            f.write('#training set size: ' + str(sum(part == 0)))
            f.write('\n')
            f.write('#training set label: 0')
            f.write('\n')
            f.write('#test set size: ' + str(sum(part == 1)))
            f.write('\n')
            f.write('#test set label: 1')
            f.write('\n')
            np.savetxt(f, part, fmt='%i')

    return 0