def acc(classifier, fname, labelF, indF, splits=10, fselect='None', nfeat=100,
        featmin=3, a=.05, header=True, hmap=False, f=True):
    """Cross-validated classification accuracy over pre-assigned CV folds.

    Loads a data matrix, a label column and a 1-based fold-index column
    from CSV files (``fname`` / ``labelF`` / ``indF``, each without the
    '.csv' extension), optionally applies feature selection, fits
    ``classifier`` on each training split and records accuracy on the
    held-out fold as well as on the training split itself.

    Parameters
    ----------
    classifier : estimator with sklearn-style ``fit`` / ``predict_proba``.
    fname, labelF, indF : str
        CSV path stems for the data matrix, labels and fold indices.
    splits : int
        Number of CV folds; fold indices run 1..splits and rows with a
        non-positive index are dropped.
    fselect : str
        Feature-selection spec. May contain 'min' (drop features with at
        most ``featmin`` nonzero samples) combined with one of 'MI'
        (SelectKBest by mutual information), 'PCA', or 'reg' (SelectFpr).
    nfeat : int
        Features kept by 'MI' / components kept by 'PCA'.
    featmin : int
        Nonzero-sample threshold for the 'min' filter.
    a : float
        Alpha for the 'reg' (SelectFpr) filter.
    header : bool
        Whether the data CSV has a header row and an index column.
    hmap : bool
        If True, save heatmaps of each train/test split.
    f : bool
        If True, silence sklearn ConvergenceWarning.

    Returns
    -------
    tuple
        (mean_acc, std_acc, p_val, mean_acc_tr, std_acc_tr, p_val_tr) —
        per-fold accuracy means/stds plus one-sample t-test p-values
        against the baseline rate 755/2126 (presumably the majority-class
        frequency of the original dataset — verify against the data).
    """
    if f:
        filterwarnings("ignore", category=ConvergenceWarning)
    val_acc = []
    train_acc = []

    # --- load data -------------------------------------------------------
    if header:
        cts = pd.read_csv(fname + '.csv', header=0, index_col=0, dtype={0: str})
    else:
        cts = pd.read_csv(fname + '.csv', header=None, index_col=None)
    ind = pd.read_csv(indF + '.csv', header=None)
    label = pd.read_csv(labelF + '.csv', header=0, index_col=0)

    # Keep only rows that were assigned to a fold (index > 0).
    rows = np.where(ind > 0)[0]
    if cts.shape[0] == len(rows):
        phi = cts
    else:
        phi = cts.iloc[rows]
    cancer = label.iloc[rows, 0]
    ind = ind.iloc[rows, 0]

    for i in range(splits):
        # Training split: every fold except i + 1.
        rows = np.where(ind != i + 1)[0]
        X = phi.iloc[rows]
        s = X.shape
        # Flatten a possible trailing axis; after this X is an ndarray.
        if len(s) == 3:
            X = np.reshape(X, [s[0], s[1] * s[2]])
        else:
            X = np.reshape(X, [s[0], s[1]])
        y = np.reshape(cancer.iloc[rows], s[0])

        # Held-out fold i + 1.
        rows = np.where(ind == i + 1)[0]
        X_test = phi.iloc[rows]
        s = X_test.shape
        if len(s) == 3:
            X_test = np.reshape(X_test, [s[0], s[1] * s[2]])
        else:
            X_test = np.reshape(X_test, [s[0], s[1]])
        y_test = np.reshape(cancer.iloc[rows], s[0])

        # --- feature subsetting -----------------------------------------
        if 'min' in fselect:
            cols = np.where(X.astype(bool).sum(axis=0) > featmin)[0]
            # X / X_test are ndarrays here (reshaped above), so plain
            # numpy column indexing is required — .iloc would raise.
            X = X[:, cols]
            X_test = X_test[:, cols]
        if 'MI' in fselect:
            model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'PCA' in fselect:
            model = PCA(n_components=nfeat).fit(X)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif 'reg' in fselect:
            model = SelectFpr(f_classif, alpha=a).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        if hmap:
            heatmap(X, y, tail='_{0}_train'.format(i))
            heatmap(X_test, y_test, tail='_{0}_test'.format(i))

        # --- fit and score ----------------------------------------------
        model = classifier.fit(X, y)
        # Validation accuracy: predicted class = argmax of class probs.
        probas_ = model.predict_proba(X_test)
        y_hat = np.argmax(probas_, axis=1)
        val_acc.append(sum(y_hat == y_test) / len(y_test))
        # Training accuracy (over-fit diagnostic).
        probas_ = model.predict_proba(X)
        y_hat = np.argmax(probas_, axis=1)
        train_acc.append(sum(y_hat == y) / len(y))

    # One-sample t-tests of the per-fold accuracies against the baseline.
    results = stats.ttest_1samp(val_acc, popmean=755 / 2126)
    p_val = results[1]
    results = stats.ttest_1samp(train_acc, popmean=755 / 2126)
    p_val_tr = results[1]
    return np.mean(val_acc), np.std(val_acc), p_val, np.mean(
        train_acc), np.std(train_acc), p_val_tr
def roc(classifier, mdict, pname, splits=10, fselect='None', nfeat=100):
    """Per-fold ROC curves and AUC statistics for pre-split CV data.

    ``mdict`` is expected to hold MATLAB-style cell arrays (e.g. as
    loaded by ``scipy.io.loadmat``) under the keys 'phi', 'testPhi',
    'cvTrainASD' and 'cvTestASD', indexed by ``(fold, 0)`` — TODO confirm
    against the producer of this dict.  For every fold the classifier is
    fit on the training matrix, the held-out ROC curve is added to the
    current matplotlib figure, and a mean-ROC band is saved to
    ``plots/<pname>.png``.

    Parameters
    ----------
    classifier : estimator with sklearn-style ``fit`` / ``predict_proba``.
    mdict : mapping holding the per-fold train/test matrices and labels.
    pname : str
        Basename of the output plot under ``plots/``.
    splits : int
        Number of CV folds stored in ``mdict``.
    fselect : str
        'MI' (SelectKBest by mutual information), 'PCA', or anything
        else for no feature selection.
    nfeat : int
        Features kept by 'MI' / components kept by 'PCA'.

    Returns
    -------
    tuple
        (mean_auc, std_auc, p_val, mean_auc_tr, std_auc_tr, p_val_tr),
        with p-values from one-sample t-tests of the per-fold AUCs
        against chance (0.5).
    """
    tprs = []
    aucs = []
    aucs_tr = []
    mean_fpr = np.linspace(0, 1, 100)

    # Unpack the pre-computed CV splits.
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = mdict.get('cvTrainASD')
    testASD = mdict.get('cvTestASD')

    for i in range(splits):
        # Flatten each fold's 3-D matrix to (samples, features).
        X = phi[(i, 0)]
        s = X.shape
        X = np.reshape(X, [s[0], s[1] * s[2]])
        y = np.reshape(asd[(i, 0)], s[0])
        X_test = testPhi[(i, 0)]
        s = X_test.shape
        X_test = np.reshape(X_test, [s[0], s[1] * s[2]])
        y_test = np.reshape(testASD[(i, 0)], s[0])

        # Optional feature reduction.
        if fselect == 'MI':
            model = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
            X = model.transform(X)
            X_test = model.transform(X_test)
        elif fselect == 'PCA':
            model = PCA(n_components=nfeat).fit(X)
            X = model.transform(X)
            X_test = model.transform(X_test)

        # Fit and compute the held-out ROC curve / AUC.
        model = classifier.fit(X, y)
        probas_ = model.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        # np.interp is the drop-in replacement for the removed
        # scipy.interp alias; resample each fold's TPR onto a common grid.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

        # Training-set AUC (over-fit diagnostic; not plotted).
        probas_ = model.predict_proba(X)
        fpr, tpr, thresholds = roc_curve(y, probas_[:, 1])
        aucs_tr.append(auc(fpr, tpr))

    # Chance diagonal plus the mean ROC with a +/- 1 std. dev. band.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right", prop={'size': 6})
    plt.savefig('plots/' + pname + '.png')
    plt.clf()

    # One-sample t-tests of the per-fold AUCs against chance.
    results = stats.ttest_1samp(aucs, popmean=0.5)
    p_val = results[1]
    results = stats.ttest_1samp(aucs_tr, popmean=0.5)
    p_val_tr = results[1]
    return mean_auc, std_auc, p_val, np.mean(aucs_tr), np.std(
        aucs_tr), p_val_tr