best_param1 = np.NaN best_param2 = np.NaN best_balacc = np.NaN for clf_name, clf in zip(classifiernames, classifiers): print(clf_name) if clf_name is "XGBoost": best_param1 = np.NaN best_param2 = np.NaN best_balacc = np.NaN clf = XGBClassifier() clf.fit(X_train_selected, y_train) # score = clf.score(X_test_selected, y_test) if hasattr(clf, "decision_function"): score = clf.decision_function(X_test_selected) else: score = clf.predict_proba(X_test_selected) y_pred = clf.predict(X_test_selected) y_train_pred = clf.predict(X_train_selected) predscoredf = pd.DataFrame(data=y_pred, columns=["Survival"]) predscoredf["Score"] = score.tolist() predscoredf.to_csv(os.path.join( featseloutdir, 'ypred_' + sel_name + '_' + clf_name + '_split' + str(split) + '_numfeat' + str(numfeat) + '.csv'), index=None) pd.DataFrame(data=y_train_pred, columns=['Survival']) \ .to_csv(os.path.join(featseloutdir, 'ytrainpred_' + sel_name + '_' + clf_name + '_split' + str(split) + '_numfeat' + str(numfeat)
# Number of top-ranked features kept per selector and written to disk.
_N_TOP_FEATURES = 50

# Selectors that are called with ``n_selected_features`` and return
# ``(selected indices, scores, ...)`` instead of one score per feature.
_MI_SELECTORS = ("JMI", "CIFE", "DISR", "MIM", "CMIM", "ICAP", "MRMR", "MIFS")


def _rank_features(sel_name, sel, X_train, y_train):
    """Run one feature selector; return (indices, scores) of the kept features."""
    if sel_name in _MI_SELECTORS:
        selidx, selscore, _ = sel(X_train, y_train,
                                  n_selected_features=_N_TOP_FEATURES)
        return selidx, selscore

    if sel_name == "CHSQ":
        # shift X values to be non-negative for chsq feature selection
        selscore = sel(X_train + np.abs(X_train.min()), y_train)
    elif sel_name == "RELF":
        # NOTE(review): k=50 is passed through unchanged; confirm what ``k``
        # means for reliefF (it is deliberately not tied to _N_TOP_FEATURES).
        selscore = sel(X_train, y_train, k=50)
    else:
        selscore = sel(X_train, y_train)

    # NOTE(review): every score vector is ranked in descending order; confirm
    # that "larger is better" holds for each selector that ends up here.
    selidx = np.argsort(selscore)[::-1][:_N_TOP_FEATURES]
    return selidx, selscore[selidx]


def _classifier_variants(clf_name, default_clf):
    """Yield (filename suffix, estimator) for each hyper-parameter setting of a classifier."""
    if clf_name == "XGBoost":
        # A fresh instance is built for every fit.
        yield "", XGBClassifier()
    elif clf_name == "Nearest Neighbors":
        for num_n in np.arange(3, 22, 3):
            yield "_numN" + str(num_n), KNeighborsClassifier(n_neighbors=num_n)
    elif clf_name == "Linear SVC":
        for c in [0.25, 0.5, 1, 2, 4]:
            yield "_C" + str(c), SVC(kernel="linear", C=c)
    elif clf_name == "RBF SVC":
        for c in [0.25, 0.5, 1, 2, 4]:
            for g in ['scale', 'auto', 0.01, 0.1, 1, 10, 100]:
                yield "_C" + str(c) + "_gamma" + str(g), SVC(gamma=g, C=c)
    elif clf_name == "Decision Tree":
        for d in [5, 10, 15, 20]:
            yield "_maxd" + str(d), DecisionTreeClassifier(max_depth=d)
    elif clf_name == "Multilayer Perceptron":
        for a in [0.0001, 0.001, 0.01, 0.1, 1, 10]:
            yield "_alpha" + str(a), MLPClassifier(alpha=a, max_iter=5000)
    else:
        # No grid for this classifier: reuse (and refit) the shared instance.
        yield "", default_clf


def _fit_and_save(clf, X_train_selected, y_train, X_test_selected, out_dir, tag):
    """Fit ``clf``, then write test predictions with scores and train predictions to CSV."""
    clf.fit(X_train_selected, y_train)
    if hasattr(clf, "decision_function"):
        score = clf.decision_function(X_test_selected)
    else:
        # Second probability column is used as the score.
        score = clf.predict_proba(X_test_selected)[:, 1]
    y_pred = clf.predict(X_test_selected)
    y_train_pred = clf.predict(X_train_selected)

    pd.DataFrame(data=np.column_stack((y_pred, score)), columns=['Survival', 'Score']) \
        .to_csv(os.path.join(out_dir, 'ypred_' + tag + '.csv'), index=None)
    pd.DataFrame(data=y_train_pred, columns=['Survival']) \
        .to_csv(os.path.join(out_dir, 'ytrainpred_' + tag + '.csv'), index=None)


def main(input_features_path_csv: str, output_path: str, survival_column: str, exclude_columns: str,
         class_boundary_list: list, num_folds: int, split_path: str, save_splits: bool):
    """Run feature selection and classification over stratified CV splits.

    For every class boundary the survival column is encoded with
    ``survival_classencoding``, the data is split with ``StratifiedKFold``,
    and for every split each feature selector is run on the training part.
    For every feature-count in 5, 10, ..., 50 each classifier (with its
    hyper-parameter grid where one is defined) is fitted on the selected
    features, and its predictions are written as CSV files.

    Files written:
      * ``output_path``: ``Xtrain``/``Xtest``/``ytrain``/``ytest`` CSVs per
        boundary and split.
      * ``output_path/<class_boundary>``: the top-50 feature list per
        selector and split, and ``ypred_*`` / ``ytrainpred_*`` CSVs per
        selector, classifier setting, split and feature count.

    Args:
        input_features_path_csv: Path to the CSV holding features and the
            survival column.
        output_path: Directory that receives all output files.
        survival_column: Name of the column holding the survival values.
        exclude_columns: Accepted for interface compatibility; not used here.
        class_boundary_list: Boundaries, each passed on its own (as a
            one-element list) to ``survival_classencoding``.
        num_folds: Number of stratified folds; must be an int > 1.
        split_path: Accepted for interface compatibility; not used here.
        save_splits: Accepted for interface compatibility; not used here.

    Raises:
        AssertionError: If the input file does not exist, ``num_folds`` is
            not an int > 1, or ``survival_column`` is missing from the CSV.
    """
    print(input_features_path_csv)
    np.random.seed(42)

    assert os.path.isfile(input_features_path_csv), "Input feature csv file not found"
    assert isinstance(num_folds, int) and num_folds > 1, \
        "Please enter an int > 1 for the number of folds."
    input_features = pd.read_csv(input_features_path_csv)
    assert survival_column in input_features.columns, "Survival column not found"

    numfeatlist = np.arange(5, 55, 5)

    classifiers = [
        ("Nearest Neighbors", KNeighborsClassifier(3)),
        ("Linear SVC", SVC(kernel="linear", C=0.025)),
        ("RBF SVC", SVC(gamma=2, C=1)),
        ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' is rejected by
        # newer scikit-learn releases.
        ("Random Forest", RandomForestClassifier(n_estimators=100, max_features='sqrt')),
        ("Multilayer Perceptron", MLPClassifier(alpha=1, max_iter=5000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("QDA", QuadraticDiscriminantAnalysis()),
        ("XGBoost", XGBClassifier),  # placeholder; instantiated per fit
        ("Logistic Regression", LogisticRegression()),
    ]

    selectors = [
        ("RELF", reliefF.reliefF),            # Relief
        ("FSCR", fisher_score.fisher_score),  # Fisher score
        ("GINI", gini_index.gini_index),      # Gini index
        ("CHSQ", chi_square.chi_square),      # Chi-square
        ("JMI", JMI.jmi),                     # Joint mutual information
        ("CIFE", CIFE.cife),                  # Conditional infomax feature extraction
        ("DISR", DISR.disr),                  # Double input symmetric relevance
        ("MIM", MIM.mim),                     # Mutual information maximization
        ("CMIM", CMIM.cmim),                  # Conditional mutual information maximization
        ("ICAP", ICAP.icap),                  # Interaction capping
        ("TSCR", t_score.t_score),            # T-test
        ("MRMR", MRMR.mrmr),                  # Minimum redundancy maximum relevance
        ("MIFS", MIFS.mifs),                  # Mutual information feature selection
    ]

    for class_boundary in class_boundary_list:
        y_arr = pd.Series(
            survival_classencoding(input_features[survival_column].values, [class_boundary])).values
        y = pd.DataFrame(y_arr, columns=["Survival"])

        X = input_features.drop(survival_column, axis=1)
        try:
            X.drop("ID", axis=1, inplace=True)
        except KeyError:
            print('No ID column found')
        X_arr = X.values

        # Folder per class boundary for selected features and results. Created
        # before the first write so that output_path itself exists as well.
        boundarydir = os.path.join(output_path, str(class_boundary))
        os.makedirs(boundarydir, exist_ok=True)

        # generate splits for stratified cross validation
        skf = StratifiedKFold(n_splits=num_folds, random_state=None, shuffle=False)
        for splitcount, (train_index, test_index) in enumerate(skf.split(X_arr, y_arr)):
            X_train, X_test = X_arr[train_index], X_arr[test_index]
            y_train = y_arr[train_index]

            split_tag = '_2class_split_boundary' + str(class_boundary) \
                + '_split' + str(splitcount) + '.csv'
            X.iloc[train_index].to_csv(os.path.join(output_path, 'Xtrain' + split_tag), index=False)
            X.iloc[test_index].to_csv(os.path.join(output_path, 'Xtest' + split_tag), index=False)
            y.iloc[train_index].to_csv(os.path.join(output_path, 'ytrain' + split_tag), index=False)
            y.iloc[test_index].to_csv(os.path.join(output_path, 'ytest' + split_tag), index=False)

            # for every split, perform feature selection
            for sel_name, sel in selectors:
                print('#####')
                print(sel_name)
                print('#####')

                selidx, selscore = _rank_features(sel_name, sel, X_train, y_train)
                pd.DataFrame(
                    data=np.column_stack((X.columns[selidx].values, selscore)),
                    columns=['Feature', 'Score']) \
                    .to_csv(os.path.join(boundarydir,
                                         sel_name + '_50features_split' + str(splitcount) + '.csv'),
                            index=None)

                # get subsets for all number of features
                for numfeat in numfeatlist:
                    X_train_selected = X_train[:, selidx[0:numfeat]]
                    X_test_selected = X_test[:, selidx[0:numfeat]]

                    # do classification with all classifiers
                    for clf_name, clf in classifiers:
                        print(clf_name)
                        for suffix, model in _classifier_variants(clf_name, clf):
                            tag = sel_name + '_' + clf_name + suffix \
                                + '_split' + str(splitcount) + '_numfeat' + str(numfeat)
                            _fit_and_save(model, X_train_selected, y_train,
                                          X_test_selected, boundarydir, tag)