def fs_continuous(X, y, method):
    """Select features of ``X`` for a continuous (regression) target ``y``.

    Each of the four method names runs its own selector; no method is
    silently replaced by another inside this function.

    Parameters
    ----------
    X : array-like with a two-element ``shape`` (rows, columns)
        Feature matrix handed unchanged to the chosen selector.
    y : array-like
        Target handed unchanged to the chosen selector's ``fit``.
    method : {'Boruta', 'JMI', 'L1', 'FDR'}
        * ``'Boruta'`` - BorutaPy wrapped around a RandomForestRegressor.
        * ``'JMI'``    - mifs MutualInformationFeatureSelector, method 'JMI',
          ``categorical=False``.
        * ``'L1'``     - SelectFromModel wrapped around LassoCV.
        * ``'FDR'``    - SelectFdr with ``f_regression`` at alpha = 0.05.

    Returns
    -------
    selected : numpy.ndarray
        Column indices of ``X`` kept by the selector.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four names above.
    """
    # Unpacking also rejects inputs that are not two-dimensional.
    _, p = X.shape

    if method == 'Boruta':
        rf = RandomForestRegressor(n_jobs=-1)
        selector = boruta.BorutaPy(rf, n_estimators='auto')
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'JMI':
        selector = mifs.MutualInformationFeatureSelector(method='JMI',
                                                         categorical=False)
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'L1':
        # ``normalize`` is omitted on purpose: False was its default and the
        # parameter no longer exists in recent scikit-learn releases.
        lasso = LassoCV(n_jobs=-1)
        sfm = SelectFromModel(lasso)
        sfm.fit(X, y)
        # Transforming a row of column indices leaves only the kept indices.
        selected = sfm.transform(np.arange(p).reshape(1, -1))[0]
    elif method == 'FDR':
        # alpha is passed by keyword; it is keyword-only in recent
        # scikit-learn releases.
        fdr = fs.SelectFdr(fs.f_regression, alpha=.05)
        fdr.fit(X, y)
        selected = fdr.transform(np.arange(p).reshape(1, -1))[0]
    else:
        raise ValueError(
            "unknown feature selection method {!r}; expected one of "
            "'Boruta', 'JMI', 'L1', 'FDR'".format(method)
        )
    return selected
def run_boruta(X, y, p):
    """Fit BorutaPy on ``(X, y)`` and return the positions it supports.

    A RandomForestClassifier (all cores, balanced class weights, depth
    capped at 5) is the base estimator. ``p`` is forwarded to BorutaPy as
    its ``perc`` argument.

    Returns a plain list of integer positions whose entry in the fitted
    selector's ``support_`` is truthy.
    """
    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                    max_depth=5)
    selector = boruta.BorutaPy(forest, n_estimators='auto', random_state=1,
                               perc=p)
    selector.fit(X, y)
    return [index for index, keep in enumerate(selector.support_) if keep]
def run_boruta(X, y, p):
    """Fit BorutaPy on ``(X, y)``, print the elapsed time, return the picks.

    A RandomForestClassifier (all cores, balanced class weights, depth
    capped at 5) is the base estimator. ``p`` is forwarded to BorutaPy as
    its ``perc`` argument. The wall-clock duration, in minutes, is printed
    before returning.

    Returns a plain list of integer positions whose entry in the fitted
    selector's ``support_`` is truthy.
    """
    import time

    started = time.time()
    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                    max_depth=5)
    selector = boruta.BorutaPy(forest, n_estimators='auto', random_state=1,
                               perc=p)
    selector.fit(X, y)
    chosen = [index for index, keep in enumerate(selector.support_) if keep]
    elapsed_minutes = (time.time() - started) / 60
    print('RUNNING BORUTA TIME: {} min'.format(elapsed_minutes))
    return chosen
def do_fs(X, y, method):
    """Run one feature-selection method for a classification target.

    Parameters
    ----------
    X : array-like with a two-element ``shape`` (rows, columns)
        Feature matrix handed unchanged to the chosen selector.
    y : array-like
        Class labels handed unchanged to the chosen selector.
    method : {'fdr', 'l1svc', 'boruta', 'jmi'}
        * ``'fdr'``    - SelectFdr with ``f_classif`` at alpha = 0.05.
        * ``'l1svc'``  - ``lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)``.
        * ``'boruta'`` - BorutaPy wrapped around a RandomForestClassifier.
        * ``'jmi'``    - mifs MutualInformationFeatureSelector, method 'JMI'.

    Returns
    -------
    sel
        The selection produced by the chosen method. For 'fdr', 'boruta'
        and 'jmi' this is an array of kept column indices; for 'l1svc' it
        is whatever ``recursive_lsvc_cv`` returns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four names above.
    """
    # Unpacking also rejects inputs that are not two-dimensional.
    _, n_features = X.shape

    if method == "fdr":
        # Transforming a row of column indices leaves only the kept indices.
        index_row = np.arange(n_features).reshape(1, -1)
        # alpha is passed by keyword; it is keyword-only in recent
        # scikit-learn releases.
        fdr = fs.SelectFdr(fs.f_classif, alpha=.05)
        sel = fdr.fit(X, y).transform(index_row)[0]
    elif method == "l1svc":
        # NOTE(review): the numeric arguments are presumably the search
        # range and grid size for C - confirm against lsvc_cv.
        sel = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == "boruta":
        rf = RandomForestClassifier(n_jobs=-1)
        b = boruta.BorutaPy(rf, n_estimators='auto')
        b.fit(X, y)
        sel = np.where(b.support_)[0]
    elif method == "jmi":
        selector = mifs.MutualInformationFeatureSelector(method='JMI')
        selector.fit(X, y)
        sel = np.where(selector.support_)[0]
    else:
        raise ValueError(
            "unknown feature selection method {!r}; expected one of "
            "'fdr', 'l1svc', 'boruta', 'jmi'".format(method)
        )
    return sel
def fs_categorical(X, y, method):
    """Select features of ``X`` for a categorical (classification) target.

    Parameters
    ----------
    X : array-like with a two-element ``shape`` (rows, columns)
        Feature matrix handed unchanged to the chosen selector.
    y : array-like
        Class labels handed unchanged to the chosen selector.
    method : str
        * ``'Boruta'`` - BorutaPy wrapped around a RandomForestClassifier.
        * ``'JMI'``    - mifs MutualInformationFeatureSelector, method 'JMI'.
        * ``'L1'``     - ``lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)``.
        * ``'FDR'``    - SelectFdr with ``f_classif`` at alpha = 0.05.

    Returns
    -------
    selected
        The selection produced by the chosen method. Any other ``method``
        value yields an empty list.
    """
    # Unpacking also rejects inputs that are not two-dimensional.
    _, p = X.shape

    # Fallback for unrecognised method names.
    selected = []

    if method == 'Boruta':
        rf = RandomForestClassifier(n_jobs=-1)
        selector = boruta.BorutaPy(rf, n_estimators='auto')
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'JMI':
        selector = mifs.MutualInformationFeatureSelector(method='JMI')
        selector.fit(X, y)
        selected = np.where(selector.support_)[0]
    elif method == 'L1':
        # NOTE(review): the numeric arguments are presumably the search
        # range and grid size for C - confirm against lsvc_cv.
        selected = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == 'FDR':
        # alpha is passed by keyword; it is keyword-only in recent
        # scikit-learn releases.
        fdr = fs.SelectFdr(fs.f_classif, alpha=.05)
        fdr.fit(X, y)
        # Transforming a row of column indices leaves only the kept indices.
        selected = fdr.transform(np.arange(p).reshape(1, -1))[0]
    return selected
def borutabench(best_params_rf, X, y, random_state, prefix=""):
    """Fit Boruta on ``(X, y)`` and save a plot of its support mask.

    The base estimator comes from ``model(params=best_params_rf,
    random_state=random_state)``. After fitting, the selector's
    ``support_`` is passed (wrapped in a one-element list) to
    ``plot_class_list``, the figure is titled, and it is written to
    ``PATH / (prefix + "freqimpsAR.pdf")``.

    Returns nothing; the saved figure is the result.
    """
    base_estimator = model(params=best_params_rf, random_state=random_state)
    selector = boruta.BorutaPy(base_estimator)
    selector.fit(X, y)

    plot_class_list([selector.support_])
    plt.title("Frequency of inclusion in AllRel Set (Boruta)")

    output_file = PATH / (prefix + "freqimpsAR.pdf")
    plt.savefig(output_file)
def do_fs(X, y):
    """Run seven feature-selection methods on ``(X, y)`` and return them all.

    Returns a 7-tuple, in this order:
    ``(sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr, sel_b_rf,
    sel_jmi)`` - univariate top-10 percentile, univariate FDR at 0.05,
    cross-validated recursive feature elimination with an L1 LinearSVC,
    ``lsvc_cv.recursive_lsvc_cv``, randomized logistic regression
    (stability selection), Boruta, and JMI.
    """
    n_samples, n_features = X.shape
    # One row holding 0..n_features-1: pushing it through a fitted selector's
    # ``transform`` leaves exactly the indices of the columns it keeps.
    index_row = np.arange(n_features).reshape(1, -1)

    # ---- Univariate selection -------------------------------------------
    # Keep the top 10% of features by ANOVA F-score.
    percentile_selector = fs.SelectPercentile(fs.f_classif, 10)
    percentile_selector.fit(X, y)
    sel_uni_perc = percentile_selector.transform(index_row)[0]

    # Control the false discovery rate at alpha = .05.
    fdr_selector = fs.SelectFdr(fs.f_classif, .05)
    fdr_selector.fit(X, y)
    sel_uni_fdr = fdr_selector.transform(index_row)[0]

    # ---- RFECV -------------------------------------------------------------
    # Cross-validated grid search for the best C of an L1-penalised LinearSVC.
    param_grid = {'C': np.logspace(-6, 3, 10)}
    svc = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)
    grid_cv = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1)
    grid_cv.fit(X, y)

    # Scale the best C to account for the smaller training sample size under
    # cross-validation, see
    # http://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html
    cv_num = 3
    train_fraction = 1 - 1 / float(cv_num)
    adjust_c = float(n_samples * train_fraction)
    best_c = grid_cv.best_params_['C']
    svc.set_params(C=best_c * adjust_c)

    # 3-fold cross-validated recursive feature elimination, dropping the
    # worst 1% of features each round.
    rfecv = fs.RFECV(estimator=svc, step=.01, cv=cv_num, scoring='accuracy')
    rfecv.fit(X, y)
    sel_rfecv = rfecv.transform(index_row)[0]

    # ---- L1 SVC ------------------------------------------------------------
    sel_lsvc = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)

    # ---- Stability selection -----------------------------------------------
    rlr = RandomizedLogisticRegression(n_resampling=1000,
                                       C=np.logspace(-2, 2, 5),
                                       selection_threshold=0.7,
                                       sample_fraction=0.5)
    rlr.fit(X, y)
    sel_rlr = rlr.transform(index_row)[0]

    # ---- Boruta ------------------------------------------------------------
    forest = RandomForestClassifier(n_jobs=-1)
    boruta_selector = boruta.BorutaPy(forest, n_estimators='auto')
    boruta_selector.fit(X, y)
    sel_b_rf = np.where(boruta_selector.support_)[0]

    # ---- JMI ---------------------------------------------------------------
    jmi_selector = mifs.MutualInformationFeatureSelector(method='JMI')
    jmi_selector.fit(X, y)
    sel_jmi = np.where(jmi_selector.support_)[0]

    return (sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr,
            sel_b_rf, sel_jmi)