(train_data_normalized, trainShiftFactor, trainScaleFactor) = \
    normalizeAcrossEpoch(train_bp, 'MinMax')

if True:  # debug toggle: rLDA on PCA-reduced features
    train_data_downsampled = train_data_normalized[:, :, ::decim_factor]
    # Merge channel and time dimensions for the PCA
    train_reshaped = train_data_downsampled.reshape(
        train_data_downsampled.shape[0], -1)
    pca = PCA(0.95)
    pca.fit(train_reshaped)
    pca.components_ = -pca.components_  # sign flip to stay consistent with Inaki's code
    train_pcaed = pca.transform(train_reshaped)
    # Train classifier
    cls = rLDA(regcoeff)
    cls.fit(train_pcaed, label)

if False:  # debug toggle: alternative Random Forest on raw features
    pca = None
    X = compute_features(train_data_normalized, sfreq, l_freq, h_freq,
                         decim_factor, trainShiftFactor, trainScaleFactor, pca)
    # Classifier init
    RF = dict(trees=100, maxdepth=None)
    cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto',
                                 max_depth=RF['maxdepth'], n_jobs=n_jobs)
    cls.fit(X, label)
    flen = X.shape[1]
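# The shift/scale factors returned by normalizeAcrossEpoch() are reused
# verbatim on the test set, so train and test share one scaling.
# normalizeAcrossEpoch() is defined elsewhere in this repo; below is only a
# minimal, hypothetical sketch of a MinMax variant with the same
# (data, shift, scale) return shape, not the actual implementation.
def _minmax_across_epochs_sketch(data):
    """data: (n_epochs, n_channels, n_samples); per-channel MinMax over all epochs."""
    import numpy as np
    shift = data.min(axis=(0, 2), keepdims=True)          # per-channel minimum
    scale = data.max(axis=(0, 2), keepdims=True) - shift  # per-channel range
    scale[scale == 0] = 1.0                               # guard flat channels
    return (data - shift) / scale, shift, scale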
def apply_cv(epochs):
    count = 1
    confusion_matrixes = []
    confusion_matrixes_percent = []
    predicted = ''
    test_label = ''
    firstIterCV = True
    probabilities = np.array([[]], ndmin=2)
    predictions = np.array([])
    best_threshold = []
    cv_probabilities = []
    cv_probabilities_label = []

    for train, test in cv:
        ## Train data processing ##
        train_data = epochs._data[train]
        train_label = label[train]

        # Online simulation flag
        if FILTER_METHOD == 'WINDOWED':  # epochs should have one epoch only
            train_bp = mne.filter.band_pass_filter(
                train_data, sfreq, Fp1=2, Fp2=h_freq, copy=True,
                filter_length=None, method='fft',
                iir_params=None)  # bandpass on one epoch
        if FILTER_METHOD in ('NC', 'LFILT'):
            train_bp = train_data

        train_bp = train_bp[:, :, paddingIdx:paddingIdx + (int((tmax - tmin) * sfreq))]
        # Remove the per-channel mean of each trial
        for trial in range(train_bp.shape[0]):
            for ch in range(train_bp.shape[1]):
                train_bp[trial, ch, :] = train_bp[trial, ch, :] - np.mean(train_bp[trial, ch, :])

        # plt.figure()
        # plt.plot(train_bp[7,:].T)
        # plt.savefig(str(FILTER_METHOD)+'.png')

        # Normalization
        (train_normalized, trainShiftFactor, trainScaleFactor) = \
            normalizeAcrossEpoch(train_bp, 'MinMax')

        # Downsampling
        train_downsampled = train_normalized[:, :, ::decim_factor]

        # Merge (reshape) channel and time for the PCA
        train_reshaped = train_downsampled.reshape(train_downsampled.shape[0], -1)

        # PCA initialisation
        if APPLY_PCA is False:
            pca = None
            train_pcaed = train_reshaped
        else:
            pca = PCA(0.95)
            pca.fit(train_reshaped)
            pca.components_ = -pca.components_  # sign flip to be consistent with Inaki's code
            train_pcaed = pca.transform(train_reshaped)

        ## Test data processing ##
        test_data = epochs._data[test]
        test_label = label[test]

        # compute_features() does the same steps as for train (bandpass, norm,
        # downsample, merge channel and time) but requires the PCA computed on train.
        test_pcaed = compute_features(test_data, sfreq, l_freq, h_freq,
                                      decim_factor, trainShiftFactor,
                                      trainScaleFactor, pca, FILTER_METHOD,
                                      tmin, tmax, paddingIdx,
                                      iir_params=dict(a=a, b=b))
        # test_pcaed = compute_features(test_data, sfreq, l_freq, h_freq, decim_factor, trainShiftFactor, trainScaleFactor, pca=None)

        ## Test ##
        train_x = train_pcaed
        test_x = test_pcaed

        # Alternative classifiers:
        # RF = dict(trees=100, maxdepth=None)
        # cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], n_jobs=n_jobs)
        # cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], class_weight="balanced", n_jobs=n_jobs)
        # cls = LDA(solver='eigen')
        # cls = QDA(reg_param=0.3)  # regularized LDA

        # Fitting
        cls = rLDA(regcoeff)
        cls.fit(train_x, train_label)

        predicted = cls.predict(test_x)
        probs = cls.predict_proba(test_x)
        prediction = np.array(predicted)

        if useLeaveOneOut:
            if firstIterCV:
                probabilities = np.append(probabilities, probs, axis=1)
                firstIterCV = False
            else:
                probabilities = np.append(probabilities, probs, axis=0)
            predictions = np.append(predictions, prediction)
        else:
            predictions = np.append(predictions, prediction)
            probabilities = np.append(probabilities, probs)

        # Performance
        if not useLeaveOneOut:
            cm = np.array(confusion_matrix(test_label, prediction))
            cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            confusion_matrixes.append(cm)
            confusion_matrixes_percent.append(cm_normalized)
            avg_confusion_matrixes = np.mean(confusion_matrixes_percent, axis=0)

        print('CV #' + str(count))
        print('Prediction: ' + str(prediction))
        print('    Actual: ' + str(test_label))

        # Append probs to the global list
        probs_np = np.array(probs)
        cv_probabilities.append(probs_np[:, 0])
        cv_probabilities_label.append(test_label)

        # if useLeaveOneOut is not True:
        #     print('Confusion matrix')
        #     print(cm)
        #     print('Confusion matrix (normalized)')
        #     print(cm_normalized)
        #     print('---')
        #     print('True positive rate: '+str(cm_normalized[0][0]))
        #     print('True negative rate: '+str(cm_normalized[1][1]))

        print('===================')

        ## One CV fold done, go to the next one
        count += 1

    best_threshold = None
    cv_prob_linear = np.ravel(cv_probabilities)
    cv_prob_label_np = np.array(cv_probabilities_label)
    cv_prob_label_linear = np.ravel(cv_prob_label_np)
    threshold_list = np.linspace(0, 1, 100)

    biglist_fpr = []
    biglist_tpr = []
    biglist_thresh = []
    biglist_cms = []
    for thresh in threshold_list:
        # List comprehension to quickly binarize the probabilities
        biglist_pred = [4 if x < thresh else 3 for x in cv_prob_linear]
        biglist_cm = confusion_matrix(cv_prob_label_linear, biglist_pred)
        biglist_cm_norm = biglist_cm.astype('float') / biglist_cm.sum(axis=1)[:, np.newaxis]
        biglist_cms.append(biglist_cm_norm)
        biglist_tpr.append(biglist_cm_norm[0][0])
        biglist_fpr.append(biglist_cm_norm[1][0])
        biglist_thresh.append(thresh)
    biglist_auc = auc(biglist_fpr, biglist_tpr)

    # Make a subset of data where FPR < MAX_FPR
    idx_below_maxfpr = np.where(np.array(biglist_fpr) < MAX_FPR)
    fpr_below_maxfpr = np.array(biglist_fpr)[idx_below_maxfpr[0]]
    tpr_below_maxfpr = np.array(biglist_tpr)[idx_below_maxfpr[0]]

    # Look for the best (max value) TPR in that subset ...
    best_tpr_below_maxfpr = np.max(tpr_below_maxfpr)
    # ... and get its index
    best_tpr_below_maxfpr_idx = np.array(np.where(biglist_tpr == best_tpr_below_maxfpr)).ravel()

    # Get the associated FPRs
    best_tpr_below_maxfpr_associated_fpr = np.array(biglist_fpr)[best_tpr_below_maxfpr_idx]
    # Get the best (min value) FPR among them ...
    best_associated_fpr = np.min(best_tpr_below_maxfpr_associated_fpr)
    # ... and get its index
    best_associated_fpr_idx = np.array(np.where(biglist_fpr == best_associated_fpr)).ravel()

    # The best index is the one present in both sets
    best_idx = best_tpr_below_maxfpr_idx[np.in1d(best_tpr_below_maxfpr_idx, best_associated_fpr_idx)]

    plt.plot(biglist_fpr, biglist_tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

    best_threshold = threshold_list[best_idx]
    print('#################################')
    print('Best threshold: ' + str(best_threshold))
    print('Gives a TPR of ' + str(best_tpr_below_maxfpr))
    print('And a FPR of ' + str(best_associated_fpr))
    print('CM')
    print(biglist_cms[best_idx[0]])
    return (biglist_auc, best_threshold)
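# The threshold search in apply_cv() sweeps 100 candidate cutoffs over the
# class-0 probability, computes a normalized confusion matrix per cutoff, and
# keeps the highest TPR whose FPR stays below MAX_FPR. A self-contained toy
# run of the same sweep (synthetic probabilities; labels 3/4 as in this
# codebase; all names local to this sketch):
def _threshold_sweep_demo():
    import numpy as np
    from sklearn.metrics import confusion_matrix
    rng = np.random.RandomState(0)
    probs = np.r_[rng.uniform(0.4, 1.0, 50), rng.uniform(0.0, 0.6, 50)]
    labels = np.r_[np.full(50, 3), np.full(50, 4)]
    max_fpr = 0.1
    best = (0.0, 0.0)  # (tpr, threshold)
    for thresh in np.linspace(0, 1, 100):
        pred = np.where(probs < thresh, 4, 3)
        cm = confusion_matrix(labels, pred, labels=[3, 4]).astype(float)
        cm_norm = cm / cm.sum(axis=1)[:, np.newaxis]
        tpr, fpr = cm_norm[0][0], cm_norm[1][0]
        if fpr < max_fpr and tpr > best[0]:
            best = (tpr, thresh)
    print('best TPR %.2f at threshold %.2f' % best)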
def cross_validate(cfg, featdata, cv_file=None):
    """
    Perform cross validation
    """
    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            presort='auto',
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER['GB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False)
    elif selected_classifier == 'XGB':
        cls = XGBClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            presort='auto',
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            random_state=cfg.CLASSIFIER['XGB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False)
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_cv = cfg.CV_PERFORM['selected']
    if selected_cv == 'LeaveOneOut':
        logger.info_green('%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_cv == 'StratifiedShuffleSplit':
        logger.info_green(
            '%d-fold stratified cross-validation with test set ratio %.2f' %
            (cfg.CV_PERFORM[selected_cv]['folds'],
             cfg.CV_PERFORM[selected_cv]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0],
                cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_cv]['folds'],
                test_size=cfg.CV_PERFORM[selected_cv]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_cv]['seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_cv)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # Do it!
    timer_cv = qc.Timer()
    scores, cm_txt = crossval_epochs(cv, X_data, Y_data, cls, cfg.tdef.by_value,
                                     cfg.CV['BALANCE_SAMPLES'],
                                     n_jobs=cfg.N_JOBS,
                                     ignore_thres=cfg.CV['IGNORE_THRES'],
                                     decision_thres=cfg.CV['DECISION_THRES'])
    t_cv = timer_cv.sec()

    # Export results
    txt = 'Cross validation took %d seconds.\n' % t_cv
    txt += '\n- Class information\n'
    txt += '%d epochs, %d samples per epoch, %d feature dimension (total %d samples)\n' %\
        (ntrials, nsamples, fsize, ntrials * nsamples)
    for ev in np.unique(Y_data):
        txt += '%s: %d trials\n' % (cfg.tdef.by_value[ev],
                                    len(np.where(Y_data[:, 0] == ev)[0]))
    if cfg.CV['BALANCE_SAMPLES']:
        txt += 'The number of samples was balanced using %ssampling.\n' %\
            cfg.CV['BALANCE_SAMPLES'].lower()
    txt += '\n- Experiment condition\n'
    txt += 'Sampling frequency: %.3f Hz\n' % featdata['sfreq']
    txt += 'Spatial filter: %s (channels: %s)\n' % (cfg.SP_FILTER, cfg.SP_CHANNELS)
    txt += 'Spectral filter: %s\n' % cfg.TP_FILTER[cfg.TP_FILTER['selected']]
    txt += 'Notch filter: %s\n' % cfg.NOTCH_FILTER[cfg.NOTCH_FILTER['selected']]
    txt += 'Channels: ' + ','.join(
        [str(featdata['ch_names'][p]) for p in featdata['picks']]) + '\n'
    txt += 'PSD range: %.1f - %.1f Hz\n' % (cfg.FEATURES['PSD']['fmin'],
                                            cfg.FEATURES['PSD']['fmax'])
    txt += 'Window step: %.2f msec\n' % (
        1000.0 * cfg.FEATURES['PSD']['wstep'] / featdata['sfreq'])
    if type(wlen) is list:
        for i, w in enumerate(wlen):
            txt += 'Window size: %.1f msec\n' % (w * 1000.0)
            txt += 'Epoch range: %s sec\n' % (cfg.EPOCH[i])
    else:
        txt += 'Window size: %.1f msec\n' % (cfg.FEATURES['PSD']['wlen'] * 1000.0)
        txt += 'Epoch range: %s sec\n' % (cfg.EPOCH)
    txt += 'Decimation factor: %d\n' % cfg.FEATURES['PSD']['decim']

    # Compute stats
    cv_mean, cv_std = np.mean(scores), np.std(scores)
    txt += '\n- Average CV accuracy over %d epochs (random seed=%s)\n' % (
        ntrials, cfg.CV_PERFORM[selected_cv]['seed'])
    if selected_cv in ['LeaveOneOut', 'StratifiedShuffleSplit']:
        txt += "mean %.3f, std: %.3f\n" % (cv_mean, cv_std)
    txt += 'Classifier: %s, ' % selected_classifier
    if selected_classifier == 'RF':
        txt += '%d trees, %s max depth, random state %s\n' % (
            cfg.CLASSIFIER['RF']['trees'], cfg.CLASSIFIER['RF']['depth'],
            cfg.CLASSIFIER['RF']['seed'])
    elif selected_classifier in ('GB', 'XGB'):
        txt += '%d trees, %s max depth, %s learning_rate, random state %s\n' % (
            cfg.CLASSIFIER['GB']['trees'], cfg.CLASSIFIER['GB']['depth'],
            cfg.CLASSIFIER['GB']['learning_rate'], cfg.CLASSIFIER['GB']['seed'])
    elif selected_classifier == 'rLDA':
        txt += 'regularization coefficient %.2f\n' % cfg.CLASSIFIER['rLDA']['r_coeff']
    if cfg.CV['IGNORE_THRES'] is not None:
        txt += 'Decision threshold: %.2f\n' % cfg.CV['IGNORE_THRES']
    txt += '\n- Confusion Matrix\n' + cm_txt
    logger.info(txt)

    # Export to a file
    if 'export_result' in cfg.CV_PERFORM[selected_cv] and \
            cfg.CV_PERFORM[selected_cv]['export_result'] is True:
        if cv_file is None:
            if cfg.EXPORT_CLS is True:
                qc.make_dirs('%s/classifier' % cfg.DATA_PATH)
                fout = open('%s/classifier/cv_result.txt' % cfg.DATA_PATH, 'w')
            else:
                fout = open('%s/cv_result.txt' % cfg.DATA_PATH, 'w')
        else:
            fout = open(cv_file, 'w')
        fout.write(txt)
        fout.close()
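# The SKLEARN_OLD branches above exist because the CV API changed in
# scikit-learn 0.18: the old iterators took the labels (and fold count) in the
# constructor and were directly iterable, while the new classes in
# sklearn.model_selection take only parameters and yield folds via
# .split(X, y). A minimal sketch of supporting both (the try/except import is
# an assumption about how SKLEARN_OLD could be set; this repo defines it
# elsewhere):
def _cv_api_demo():
    import numpy as np
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
        sklearn_old = False
    except ImportError:
        from sklearn.cross_validation import StratifiedShuffleSplit
        sklearn_old = True
    X = np.random.rand(20, 4)
    y = np.array([0, 1] * 10)
    if sklearn_old:
        splits = StratifiedShuffleSplit(y, 5, test_size=0.2, random_state=0)
    else:
        splits = StratifiedShuffleSplit(n_splits=5, test_size=0.2,
                                        random_state=0).split(X, y)
    for train_idx, test_idx in splits:
        pass  # train on X[train_idx], evaluate on X[test_idx]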
def train_decoder(cfg, featdata, feat_file=None):
    """
    Train the final decoder using all data
    """
    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER[selected_classifier]['learning_rate'],
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'XGB':
        cls = XGBClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER[selected_classifier]['learning_rate'],
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER[selected_classifier]['r_coeff'])
    else:
        logger.error('Unknown classifier %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.FEATURES['PSD']['wlen'] is None:
        cfg.FEATURES['PSD']['wlen'] = wlen
    w_frames = featdata['w_frames']
    ch_names = featdata['ch_names']
    X_data_merged = np.concatenate(X_data)
    Y_data_merged = np.concatenate(Y_data)
    if cfg.CV['BALANCE_SAMPLES']:
        X_data_merged, Y_data_merged = balance_samples(
            X_data_merged, Y_data_merged, cfg.CV['BALANCE_SAMPLES'], verbose=True)

    # Start training the decoder
    logger.info_green('Training the decoder')
    timer = qc.Timer()
    cls.n_jobs = cfg.N_JOBS
    cls.fit(X_data_merged, Y_data_merged)
    logger.info('Trained %d samples x %d dimension in %.1f sec' %\
                (X_data_merged.shape[0], X_data_merged.shape[1], timer.sec()))
    cls.n_jobs = 1  # always set n_jobs=1 for testing

    # Export the decoder
    classes = {c: cfg.tdef.by_value[c] for c in np.unique(Y_data)}
    if cfg.FEATURES['selected'] == 'PSD':
        data = dict(cls=cls,
                    ch_names=ch_names,
                    psde=featdata['psde'],
                    sfreq=featdata['sfreq'],
                    picks=featdata['picks'],
                    classes=classes,
                    epochs=cfg.EPOCH,
                    w_frames=w_frames,
                    w_seconds=cfg.FEATURES['PSD']['wlen'],
                    wstep=cfg.FEATURES['PSD']['wstep'],
                    spatial=cfg.SP_FILTER,
                    spatial_ch=featdata['picks'],
                    spectral=cfg.TP_FILTER[cfg.TP_FILTER['selected']],
                    spectral_ch=featdata['picks'],
                    notch=cfg.NOTCH_FILTER[cfg.NOTCH_FILTER['selected']],
                    notch_ch=featdata['picks'],
                    multiplier=cfg.MULTIPLIER,
                    ref_ch=cfg.REREFERENCE[cfg.REREFERENCE['selected']],
                    decim=cfg.FEATURES['PSD']['decim'])
    clsfile = '%s/classifier/classifier-%s.pkl' % (cfg.DATA_PATH,
                                                   platform.architecture()[0])
    qc.make_dirs('%s/classifier' % cfg.DATA_PATH)
    qc.save_obj(clsfile, data)
    logger.info('Decoder saved to %s' % clsfile)

    # Reverse-lookup frequency from FFT
    fq = 0
    if type(cfg.FEATURES['PSD']['wlen']) == list:
        fq_res = 1.0 / cfg.FEATURES['PSD']['wlen'][0]
    else:
        fq_res = 1.0 / cfg.FEATURES['PSD']['wlen']
    fqlist = []
    while fq <= cfg.FEATURES['PSD']['fmax']:
        if fq >= cfg.FEATURES['PSD']['fmin']:
            fqlist.append(fq)
        fq += fq_res

    # Show top distinctive features
    if cfg.FEATURES['selected'] == 'PSD':
        logger.info_green('Good features ordered by importance')
        if selected_classifier in ['RF', 'GB', 'XGB']:
            keys, values = qc.sort_by_value(list(cls.feature_importances_), rev=True)
        elif selected_classifier in ['LDA', 'rLDA']:
            keys, values = qc.sort_by_value(cls.w, rev=True)
        keys = np.array(keys)
        values = np.array(values)

        if cfg.EXPORT_GOOD_FEATURES:
            if feat_file is None:
                gfout = open('%s/classifier/good_features.txt' % cfg.DATA_PATH, 'w')
            else:
                gfout = open(feat_file, 'w')

        if type(wlen) is not list:
            ch_names = [ch_names[c] for c in featdata['picks']]
        else:
            # Keep a reference to the original names; ch_names is rebuilt per window
            ch_names_raw = ch_names
            ch_names = []
            for w in range(len(wlen)):
                for c in featdata['picks']:
                    ch_names.append('w%d-%s' % (w, ch_names_raw[c]))

        chlist, hzlist = features.feature2chz(keys, fqlist, ch_names=ch_names)
        valnorm = values[:cfg.FEAT_TOPN].copy()
        valsum = np.sum(valnorm)
        if valsum == 0:
            valsum = 1
        valnorm = valnorm / valsum * 100.0

        # Show top-N features
        for i, (ch, hz) in enumerate(zip(chlist, hzlist)):
            if i >= cfg.FEAT_TOPN:
                break
            txt = '%-3s %5.1f Hz normalized importance %-6s raw importance %-6s feature %-5d' %\
                (ch, hz, '%.2f%%' % valnorm[i], '%.2f%%' % (values[i] * 100.0), keys[i])
            logger.info(txt)

        if cfg.EXPORT_GOOD_FEATURES:
            gfout.write('Importance(%) Channel Frequency Index\n')
            for i, (ch, hz) in enumerate(zip(chlist, hzlist)):
                gfout.write('%.3f\t%s\t%s\t%d\n' % (values[i] * 100.0, ch, hz, keys[i]))
            gfout.close()
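# The reverse-lookup in train_decoder() maps feature indices back to
# (channel, frequency): with an FFT window of wlen seconds, the PSD bin
# resolution is 1/wlen Hz, so fqlist holds the multiples of 1/wlen inside
# [fmin, fmax], which features.feature2chz() then pairs with channel names.
# A worked toy example (values are illustrative only):
def _fqlist_demo():
    wlen, fmin, fmax = 0.5, 1.0, 4.0  # 0.5 s window -> 2 Hz resolution
    fq_res = 1.0 / wlen
    fqlist = []
    fq = 0.0
    while fq <= fmax:
        if fq >= fmin:
            fqlist.append(fq)
        fq += fq_res
    print(fqlist)  # [2.0, 4.0]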
def balance_tpr(cfg, featdata):
    """
    Find the threshold of class index 0 that yields an equal number of true
    positive samples for each class. Currently only available for binary classes.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()
    """
    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        logger.info('balance_tpr(): Using %d cores' % n_jobs)
        pool = mp.Pool(n_jobs)
        results = []

    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER['GB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'XGB':
        cls = XGBClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            random_state=cfg.CLASSIFIER['XGB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.FEATURES['PSD']['wlen'] is None:
        cfg.FEATURES['PSD']['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_CV = cfg.CV_PERFORM['selected']
    if selected_CV == 'LeaveOneOut':
        logger.info_green('\n%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_CV == 'StratifiedShuffleSplit':
        logger.info_green(
            '\n%d-fold stratified cross-validation with test set ratio %.2f' %
            (cfg.CV_PERFORM[selected_CV]['folds'],
             cfg.CV_PERFORM[selected_CV]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0],
                cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_CV)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # For the classifier itself, single core is usually faster
    cls.n_jobs = 1

    Y_preds = []
    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])
    for cnum, (train, test) in enumerate(splits):
        X_train = np.concatenate(X_data[train])
        X_test = np.concatenate(X_data[test])
        Y_train = np.concatenate(Y_data[train])
        Y_test = np.concatenate(Y_data[test])
        if n_jobs > 1:
            results.append(pool.apply_async(
                get_predict_proba,
                [cls, X_train, Y_train, X_test, Y_test, cnum + 1]))
        else:
            Y_preds.append(get_predict_proba(cls, X_train, Y_train,
                                             X_test, Y_test, cnum + 1))

    # Aggregate predictions
    if n_jobs > 1:
        pool.close()
        pool.join()
        for r in results:
            Y_preds.append(r.get())
    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find the threshold for class index 0 (the median predicted probability)
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5  # should not reach here in normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx - 1] + (Y_preds[mid_idx] - Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
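# The tail of balance_tpr() is a hand-rolled median: with the pooled class-0
# probabilities sorted, an even count averages the two middle values and an
# odd count takes the middle one, so half the CV-predicted samples fall on
# each side of the returned threshold. np.median() gives the same result:
def _median_threshold_demo():
    import numpy as np
    probs = np.array([0.2, 0.9, 0.4, 0.7])
    assert np.median(probs) == 0.55  # same as the sorted/mid-point code above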
def cross_validate(cfg, featdata, cv_file=None):
    """
    Perform cross validation
    """
    # Init a classifier
    if cfg.CLASSIFIER == 'GB':
        cls = GradientBoostingClassifier(loss='deviance',
                                         learning_rate=cfg.GB['learning_rate'],
                                         n_estimators=cfg.GB['trees'],
                                         subsample=1.0,
                                         max_depth=cfg.GB['max_depth'],
                                         random_state=cfg.GB['seed'],
                                         max_features='sqrt',
                                         verbose=0,
                                         warm_start=False,
                                         presort='auto')
    elif cfg.CLASSIFIER == 'XGB':
        cls = XGBClassifier(loss='deviance',
                            learning_rate=cfg.GB['learning_rate'],
                            n_estimators=cfg.GB['trees'],
                            subsample=1.0,
                            max_depth=cfg.GB['max_depth'],
                            random_state=cfg.GB['seed'],
                            max_features='sqrt',
                            verbose=0,
                            warm_start=False,
                            presort='auto')
    elif cfg.CLASSIFIER == 'RF':
        cls = RandomForestClassifier(n_estimators=cfg.RF['trees'],
                                     max_features='auto',
                                     max_depth=cfg.RF['max_depth'],
                                     n_jobs=cfg.N_JOBS,
                                     random_state=cfg.RF['seed'],
                                     oob_score=True,
                                     class_weight='balanced_subsample')
    elif cfg.CLASSIFIER == 'LDA':
        cls = LDA()
    elif cfg.CLASSIFIER == 'rLDA':
        cls = rLDA(cfg.RLDA_REGULARIZE_COEFF)
    else:
        raise ValueError('Unknown classifier type %s' % cfg.CLASSIFIER)

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.PSD['wlen'] is None:
        cfg.PSD['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    if cfg.CV_PERFORM == 'LeaveOneOut':
        print('\n>> %d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif cfg.CV_PERFORM == 'StratifiedShuffleSplit':
        print('\n>> %d-fold stratified cross-validation with test set ratio %.2f'
              % (cfg.CV_FOLDS, cfg.CV_TEST_RATIO))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(Y_data[:, 0], cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
        else:
            cv = StratifiedShuffleSplit(n_splits=cfg.CV_FOLDS,
                                        test_size=cfg.CV_TEST_RATIO,
                                        random_state=cfg.CV_RANDOM_SEED)
    else:
        raise NotImplementedError('%s is not supported yet. Sorry.' % cfg.CV_PERFORM)
    print('%d trials, %d samples per trial, %d feature dimension' %
          (ntrials, nsamples, fsize))

    # Do it!
    timer_cv = qc.Timer()
    scores, cm_txt = crossval_epochs(cv, X_data, Y_data, cls, cfg.tdef.by_value,
                                     cfg.BALANCE_SAMPLES, n_jobs=cfg.N_JOBS,
                                     ignore_thres=cfg.CV_IGNORE_THRES,
                                     decision_thres=cfg.CV_DECISION_THRES)
    t_cv = timer_cv.sec()

    # Export results
    txt = '\n>> Cross validation took %d seconds.\n' % t_cv
    txt += '\n- Class information\n'
    txt += '%d epochs, %d samples per epoch, %d feature dimension (total %d samples)\n' %\
        (ntrials, nsamples, fsize, ntrials * nsamples)
    for ev in np.unique(Y_data):
        txt += '%s: %d trials\n' % (cfg.tdef.by_value[ev],
                                    len(np.where(Y_data[:, 0] == ev)[0]))
    if cfg.BALANCE_SAMPLES:
        txt += 'The number of samples was balanced across classes. Method: %s\n' %\
            cfg.BALANCE_SAMPLES
    txt += '\n- Experiment conditions\n'
    txt += 'Spatial filter: %s (channels: %s)\n' % (cfg.SP_FILTER, cfg.SP_CHANNELS)
    txt += 'Spectral filter: %s\n' % cfg.TP_FILTER
    txt += 'Notch filter: %s\n' % cfg.NOTCH_FILTER
    txt += 'Channels: ' + ','.join(
        [str(featdata['ch_names'][p]) for p in featdata['picks']]) + '\n'
    txt += 'PSD range: %.1f - %.1f Hz\n' % (cfg.PSD['fmin'], cfg.PSD['fmax'])
    txt += 'Window step: %.2f msec\n' % (1000.0 * cfg.PSD['wstep'] / featdata['sfreq'])
    if type(wlen) is list:
        for i, w in enumerate(wlen):
            txt += 'Window size: %.1f msec\n' % (w * 1000.0)
            txt += 'Epoch range: %s sec\n' % (cfg.EPOCH[i])
    else:
        txt += 'Window size: %.1f msec\n' % (cfg.PSD['wlen'] * 1000.0)
        txt += 'Epoch range: %s sec\n' % (cfg.EPOCH)

    # Compute stats
    cv_mean, cv_std = np.mean(scores), np.std(scores)
    txt += '\n- Average CV accuracy over %d epochs (random seed=%s)\n' % (
        ntrials, cfg.CV_RANDOM_SEED)
    if cfg.CV_PERFORM in ['LeaveOneOut', 'StratifiedShuffleSplit']:
        txt += "mean %.3f, std: %.3f\n" % (cv_mean, cv_std)
    txt += 'Classifier: %s, ' % cfg.CLASSIFIER
    if cfg.CLASSIFIER == 'RF':
        txt += '%d trees, %s max depth, random state %s\n' % (
            cfg.RF['trees'], cfg.RF['max_depth'], cfg.RF['seed'])
    elif cfg.CLASSIFIER == 'GB' or cfg.CLASSIFIER == 'XGB':
        txt += '%d trees, %s max depth, %s learning_rate, random state %s\n' % (
            cfg.GB['trees'], cfg.GB['max_depth'], cfg.GB['learning_rate'],
            cfg.GB['seed'])
    elif cfg.CLASSIFIER == 'rLDA':
        txt += 'regularization coefficient %.2f\n' % cfg.RLDA_REGULARIZE_COEFF
    if cfg.CV_IGNORE_THRES is not None:
        txt += 'Decision threshold: %.2f\n' % cfg.CV_IGNORE_THRES
    txt += '\n- Confusion Matrix\n' + cm_txt
    print(txt)

    # Export to a file
    if hasattr(cfg, 'CV_EXPORT_RESULT') and cfg.CV_EXPORT_RESULT is True \
            and cfg.CV_PERFORM is not None:
        if cv_file is None:
            if cfg.EXPORT_CLS is True:
                qc.make_dirs('%s/classifier' % cfg.DATADIR)
                fout = open('%s/classifier/cv_result.txt' % cfg.DATADIR, 'w')
            else:
                fout = open('%s/cv_result.txt' % cfg.DATADIR, 'w')
        else:
            fout = open(cv_file, 'w')
        fout.write(txt)
        fout.close()
def createClassifier(loadedraw, events, tmin, tmax, tlow, thigh, regcoeff,
                     useLeaveOneOut, APPLY_CAR, APPLY_PCA, l_freq, h_freq,
                     MAX_FPR, picks_feat, baselineRange, decim_factor,
                     cv_container, FILTER_METHOD, best_threshold,
                     verbose=False):
    tdef, sfreq, event_id, b, a, zi, t_lower, t_upper, epochs, wframes = \
        preprocess(loadedraw=loadedraw,
                   events=events,
                   APPLY_CAR=APPLY_CAR,
                   l_freq=l_freq,
                   h_freq=h_freq,
                   filter_method=FILTER_METHOD,
                   tmin=tmin,
                   tmax=tmax,
                   tlow=tlow,
                   thigh=thigh,
                   n_jobs=n_jobs,
                   picks_feat=picks_feat,
                   baselineRange=baselineRange,
                   verbose=False)
    train_pcaed, pca, trainShiftFactor, trainScaleFactor = \
        compute_features(signals=epochs._data,
                         dataset_type='train',
                         sfreq=sfreq,
                         l_freq=l_freq,
                         h_freq=h_freq,
                         decim_factor=decim_factor,
                         shiftFactor=None,
                         scaleFactor=None,
                         pca=None,
                         tmin=tmin,
                         tmax=tmax,
                         tlow=tlow,
                         thigh=thigh,
                         filter_method=FILTER_METHOD)
    cls = rLDA(regcoeff)
    label = epochs.events[:, 2]
    cls.fit(train_pcaed, label)

    ch_names = [loadedraw.info['ch_names'][c] for c in picks_feat]
    data = dict(apply_car=APPLY_CAR,
                sfreq=loadedraw.info['sfreq'],
                picks=picks_feat,
                decim_factor=decim_factor,
                ch_names=ch_names,
                tmin=tmin,
                tmax=tmax,
                tlow=tlow,
                thigh=thigh,
                l_freq=l_freq,
                h_freq=h_freq,
                baselineRange=baselineRange,
                shiftFactor=trainShiftFactor,
                scaleFactor=trainScaleFactor,
                cls=cls,
                pca=pca,
                threshold=best_threshold[0],
                filter_method=FILTER_METHOD,
                wframes=wframes)
    outdir = DATADIR + '/errp_classifier'
    qc.make_dirs(outdir)
    clsfile = outdir + '/errp_classifier.pcl'
    qc.save_obj(clsfile, data)
    print('Saved as %s' % clsfile)
    print('Using ' + str(epochs._data.shape[0]) + ' epochs')
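# createClassifier() persists everything needed to reproduce the online
# pipeline (scaling factors, PCA, rLDA weights, decision threshold) in a
# single dict. Assuming qc.save_obj() is a plain pickle wrapper (an
# assumption; only the dict keys above are known from this file), a consumer
# could reload it like this:
def _load_classifier_sketch():
    import pickle
    with open(DATADIR + '/errp_classifier/errp_classifier.pcl', 'rb') as f:
        model = pickle.load(f)
    cls = model['cls']    # trained rLDA
    pca = model['pca']    # may be None if APPLY_PCA was False
    shift, scale = model['shiftFactor'], model['scaleFactor']
    # New epochs must be normalized with the *training* shift/scale and
    # PCA-transformed before calling cls.predict_proba().
    return cls, pca, shift, scale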
def processCV(loadedraw, events, tmin, tmax, tlow, thigh, regcoeff,
              useLeaveOneOut, APPLY_CAR, APPLY_PCA, l_freq, h_freq, MAX_FPR,
              picks_feat, baselineRange, decim_factor, cv_container,
              FILTER_METHOD, verbose=False):
    tdef, sfreq, event_id, b, a, zi, t_lower, t_upper, epochs, wframes = \
        preprocess(loadedraw=loadedraw,
                   events=events,
                   APPLY_CAR=True,
                   l_freq=l_freq,
                   h_freq=h_freq,
                   filter_method=FILTER_METHOD,
                   tmin=tmin,
                   tmax=tmax,
                   tlow=tlow,
                   thigh=thigh,
                   n_jobs=n_jobs,
                   picks_feat=picks_feat,
                   baselineRange=baselineRange,
                   verbose=False)

    # %% Fold creation
    # epochs.events contains the labels we want in the third column.
    # We can then get the relevant data within a fold with epochs._data[test],
    # which returns an array of size ({test}, {channel}, {time}).
    label = epochs.events[:, 2]
    cv = StratifiedShuffleSplit(label, n_iter=20, test_size=0.1, random_state=1337)
    if useLeaveOneOut:
        cv = LeaveOneOut(len(label))

    # %% Fold processing
    count = 1
    confusion_matrixes = []
    confusion_matrixes_percent = []
    predicted = ''
    test_label = ''
    firstIterCV = True
    probabilities = np.array([[]], ndmin=2)
    predictions = np.array([])
    best_threshold = []
    cv_probabilities = []
    cv_probabilities_label = []

    if cv_container is None:
        cv_container = []
        for train, test in cv:
            train_data = epochs._data[train]
            train_label = label[train]
            test_data = epochs._data[test]
            test_label = label[test]

            ## Train data processing ##
            train_pcaed, pca, trainShiftFactor, trainScaleFactor = \
                compute_features(signals=train_data,
                                 dataset_type='train',
                                 sfreq=sfreq,
                                 l_freq=l_freq,
                                 h_freq=h_freq,
                                 decim_factor=decim_factor,
                                 shiftFactor=None,
                                 scaleFactor=None,
                                 pca=None,
                                 tmin=tmin,
                                 tmax=tmax,
                                 tlow=tlow,
                                 thigh=thigh,
                                 filter_method=FILTER_METHOD)

            ## Test data processing ##
            # compute_features() does the same steps as for train (bandpass,
            # norm, downsample, merge channel and time) but requires the PCA
            # computed on train.
            test_pcaed, pca_test_unused, _, _ = \
                compute_features(signals=test_data,
                                 dataset_type='test',
                                 sfreq=sfreq,
                                 l_freq=l_freq,
                                 h_freq=h_freq,
                                 decim_factor=decim_factor,
                                 shiftFactor=trainShiftFactor,
                                 scaleFactor=trainScaleFactor,
                                 pca=pca,
                                 tmin=tmin,
                                 tmax=tmax,
                                 tlow=tlow,
                                 thigh=thigh,
                                 filter_method=FILTER_METHOD)

            ## Test ##
            train_x = train_pcaed
            test_x = test_pcaed
            cv_container.append([train_x, test_x, train_label, test_label])

    for train_x, test_x, train_label, test_label in cv_container:
        # Fitting
        cls = rLDA(regcoeff)
        cls.fit(train_x, train_label)

        # Alternative classifiers:
        # RF = dict(trees=100, maxdepth=None)
        # cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], n_jobs=n_jobs)
        # cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], class_weight="balanced", n_jobs=n_jobs)
        # cls = LDA(solver='eigen')
        # cls = QDA(reg_param=0.3)  # regularized LDA

        predicted = cls.predict(test_x)
        probs = cls.predict_proba(test_x)
        prediction = np.array(predicted)

        if useLeaveOneOut:
            if firstIterCV:
                probabilities = np.append(probabilities, probs, axis=1)
                firstIterCV = False
            else:
                probabilities = np.append(probabilities, probs, axis=0)
            predictions = np.append(predictions, prediction)
        else:
            predictions = np.append(predictions, prediction)
            probabilities = np.append(probabilities, probs)

        # Performance
        if not useLeaveOneOut:
            cm = np.array(confusion_matrix(test_label, prediction))
            cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            confusion_matrixes.append(cm)
            confusion_matrixes_percent.append(cm_normalized)
            avg_confusion_matrixes = np.mean(confusion_matrixes_percent, axis=0)

        if verbose is True:
            print('CV #' + str(count))
            print('Prediction: ' + str(prediction))
            print('    Actual: ' + str(test_label))

        # Append probs to the global list
        probs_np = np.array(probs)
        cv_probabilities.append(probs_np[:, 0])
        cv_probabilities_label.append(test_label)

        # if useLeaveOneOut is not True:
        #     print('Confusion matrix')
        #     print(cm)
        #     print('Confusion matrix (normalized)')
        #     print(cm_normalized)
        #     print('---')
        #     print('True positive rate: '+str(cm_normalized[0][0]))
        #     print('True negative rate: '+str(cm_normalized[1][1]))

        if verbose is True:
            print('===================')

        ## One CV fold done, go to the next one
        count += 1

    best_threshold = None
    cv_prob_linear = np.ravel(cv_probabilities)
    cv_prob_label_np = np.array(cv_probabilities_label)
    cv_prob_label_linear = np.ravel(cv_prob_label_np)
    threshold_list = np.linspace(0, 1, 100)

    biglist_fpr = []
    biglist_tpr = []
    biglist_thresh = []
    biglist_cms = []
    for thresh in threshold_list:
        # List comprehension to quickly binarize the probabilities
        biglist_pred = [4 if x < thresh else 3 for x in cv_prob_linear]
        biglist_cm = confusion_matrix(cv_prob_label_linear, biglist_pred)
        biglist_cm_norm = biglist_cm.astype('float') / biglist_cm.sum(axis=1)[:, np.newaxis]
        biglist_cms.append(biglist_cm_norm)
        biglist_tpr.append(biglist_cm_norm[0][0])
        biglist_fpr.append(biglist_cm_norm[1][0])
        biglist_thresh.append(thresh)
    biglist_auc = auc(biglist_fpr, biglist_tpr)

    # Make a subset of data where FPR < MAX_FPR
    idx_below_maxfpr = np.where(np.array(biglist_fpr) < MAX_FPR)
    fpr_below_maxfpr = np.array(biglist_fpr)[idx_below_maxfpr[0]]
    tpr_below_maxfpr = np.array(biglist_tpr)[idx_below_maxfpr[0]]

    # Look for the best (max value) TPR in that subset ...
    best_tpr_below_maxfpr = np.max(tpr_below_maxfpr)
    # ... and get its index
    best_tpr_below_maxfpr_idx = np.array(np.where(biglist_tpr == best_tpr_below_maxfpr)).ravel()

    # Get the associated FPRs
    best_tpr_below_maxfpr_associated_fpr = np.array(biglist_fpr)[best_tpr_below_maxfpr_idx]
    # Get the best (min value) FPR among them ...
    best_associated_fpr = np.min(best_tpr_below_maxfpr_associated_fpr)
    # ... and get its index
    best_associated_fpr_idx = np.array(np.where(biglist_fpr == best_associated_fpr)).ravel()

    # The best index is the one present in both sets
    best_idx = best_tpr_below_maxfpr_idx[np.in1d(best_tpr_below_maxfpr_idx, best_associated_fpr_idx)]

    best_threshold = threshold_list[best_idx]
    best_cm = biglist_cms[best_idx[0]]
    if verbose is True:
        print('#################################')
        print('FOR THIS CELL')
        plt.figure()
        plt.plot(biglist_fpr, biglist_tpr)
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        print('#################################')
        print('Best threshold: ' + str(best_threshold))
        print('Gives a TPR of ' + str(best_tpr_below_maxfpr))
        print('And a FPR of ' + str(best_associated_fpr))
        print('CM')
        print(best_cm)
        print('#################################')
    return (biglist_auc, best_threshold, best_cm, best_tpr_below_maxfpr,
            best_associated_fpr, cv_container, biglist_cms)
# Oversample the minority class by repeating the rows selected by idx_offset
oversampled_train_label = np.append(train_label, train_label[idx_offset])
oversampled_train_x = np.concatenate((train_x, train_x[idx_offset]), axis=0)
train_label = oversampled_train_label
train_x = oversampled_train_x

# Alternative classifiers:
# RF = dict(trees=1000, maxdepth=None)
# cls = RandomForestClassifier(n_estimators=RF['trees'], max_features='auto', max_depth=RF['maxdepth'], n_jobs=n_jobs)
# cls = LDA(solver='eigen')
# cls = QDA(reg_param=0.3)  # regularized LDA
# w, b = trainLDA(train_x, train_label, 0.3)
# predicted, probs = testLDA(test_x, w, b)

# rLDA with regularization coefficient 0.3, matching rLDA(regcoeff) usage elsewhere
rlda = rLDA(0.3)
rlda.fit(train_x, train_label)
predicted = rlda.predict(test_x)
probs = rlda.predict_proba(test_x)
prediction = np.array(predicted)

# cls.fit(train_x, train_label)
# Y_pred = cls.predict(test_x)
# prediction = Y_pred

cm = np.array(confusion_matrix(test_label, prediction))
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
tp_rates.append(cm_normalized[0][0])
tn_rates.append(cm_normalized[1][1])
confusion_matrixes.append(cm)
confusion_matrixes_percent.append(cm_normalized)
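# The fragment above balances classes by appending repeated rows of the
# minority class (idx_offset is computed elsewhere, outside this excerpt). A
# self-contained sketch of the same index-based oversampling (all names local
# to this sketch):
def _oversample_demo():
    import numpy as np
    rng = np.random.RandomState(42)
    X = rng.rand(10, 3)
    y = np.array([0] * 8 + [1] * 2)                # class 1 is the minority
    n_extra = np.sum(y == 0) - np.sum(y == 1)      # copies needed to balance
    idx_extra = rng.choice(np.where(y == 1)[0], n_extra)  # with replacement
    X_bal = np.concatenate((X, X[idx_extra]), axis=0)
    y_bal = np.append(y, y[idx_extra])
    assert np.sum(y_bal == 0) == np.sum(y_bal == 1)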