from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


def select_rfe(X, y, k_features=3):
    '''Select features with recursive feature elimination.

    Parameters
    ----------
    X : pandas.core.frame.DataFrame
        Predictor columns.
    y : array-like
        Target values.
    k_features : int, default 3
        Number of features to retain.

    Returns
    -------
    None; the selected feature names are printed.
    '''
    lm = LinearRegression()
    rfe_init = RFE(lm, n_features_to_select=k_features)
    rfe_init.fit(X, y)
    rfe_mask = rfe_init.support_  # boolean mask of retained columns
    rfe_features = X.iloc[:, rfe_mask].columns.to_list()
    print(f"Recursive Feature Elimination: {len(rfe_features)} features")
    print(rfe_features)
    return None
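# A minimal usage sketch for select_rfe (an illustration, not from the
# original project): build a toy DataFrame whose target depends on two
# of five columns, then let RFE recover them.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(100, 5)),
                      columns=["f1", "f2", "f3", "f4", "f5"])
y_demo = 2 * X_demo["f1"] - 3 * X_demo["f4"] + rng.normal(scale=0.1, size=100)

select_rfe(X_demo, y_demo, k_features=2)  # expected to print ['f1', 'f4']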
#retain unscaled features
post2000_exp = pd.concat([numfeat_post, catfeat_post_bin], axis=1)
post2000_exp = post2000_exp[pre2000_exp.columns]

#BUILD DECISION TREE MODEL (SCALED DATA)---------------------------------------
#instantiate decision tree model with coef_ exposed (see sketch below)
dt = TreeClassifierWithCoef(criterion='gini', splitter='best',
                            max_features=None, max_depth=None,
                            min_samples_split=2, min_samples_leaf=2,
                            max_leaf_nodes=None, random_state=1)

#conduct recursive feature search
#(rfe is assumed to alias sklearn.feature_selection.RFECV, which takes cv/scoring)
dt_rfe_cv = rfe(estimator=dt, step=1, cv=10, scoring='roc_auc', verbose=1)
dt_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify and plot optimal number of features (d = 17), ROC_AUC = 0.8262
#(grid_scores_ was removed in scikit-learn 1.2; use cv_results_['mean_test_score'] there)
print(dt_rfe_cv.n_features_)
print(dt_rfe_cv.grid_scores_.max())
plt.figure()
plt.xlabel("DT: Number of Features selected")
plt.ylabel("DT: Cross Validation Score (ROC_AUC)")
plt.plot(range(1, len(dt_rfe_cv.grid_scores_) + 1), dt_rfe_cv.grid_scores_)
plt.show()

#identify selected features
dt_features = pre2000_exp_scaled.columns[dt_rfe_cv.get_support()]
print(dt_features)
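# TreeClassifierWithCoef is not defined in this excerpt. A plausible
# reconstruction (an assumption, not the project's verified class) is the
# common pattern of subclassing DecisionTreeClassifier so that older
# scikit-learn versions of RFE/RFECV, which looked for a coef_ attribute,
# can rank features from a tree model:
from sklearn.tree import DecisionTreeClassifier

class TreeClassifierWithCoef(DecisionTreeClassifier):
    def fit(self, *args, **kwargs):
        super().fit(*args, **kwargs)
        # expose impurity-based importances under the name RFECV looks for
        self.coef_ = self.feature_importances_
        return self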
df_join = pd.merge(
    df_gini, df_pc, on="Features", how="inner"
)  # Join by column, keeping only items that exist in both; select outer or left for other options
df_features = df_join["Features"]  # Save features from data frame
features = df_features.tolist()  # Convert to list

## Setup Predictors and RFE
df_rfe = df_nev[features]  # Add selected features to df
df_rfe["outcome"] = df_nev["outcome"]  # Add outcome to RFE df
df_rfe = df_rfe.dropna()  # Drop all rows with NA
X = df_rfe[df_features]  # Save feature columns as predictor data frame
Y = df_rfe["outcome"]  # Use outcome data frame
Log_RFE = LogisticRegression(
    solver="liblinear", max_iter=4000
)  # Use regression coefficients as the estimator's importance scores
selector = rfe(
    estimator=Log_RFE, step=1, min_features_to_select=1
)  # define selection parameters; min_features_to_select is an RFECV parameter, so rfe is assumed to alias RFECV. See Readme for more info

## Run Recursive Feature Selection
selected = selector.fit(X, Y)  # This will take time

## Output RFE results
ar_rfe = selected.support_  # Save Boolean values as numpy array
l_rfe = list(zip(X, ar_rfe))  # Pair each column name with its RFE flag
df_rfe = pd.DataFrame(
    l_rfe, columns=["Features", "RFE"]
)  # Create data frame with feature names and their RFE flags
df_rfe = df_rfe[df_rfe.RFE == True]  # Keep variables that were True
df_rfe = df_rfe.drop(columns=["RFE"])  # Drop unwanted columns

## Verify
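# A minimal sketch of the "Verify" step (an assumption -- the original
# verification code is not shown): inspect what the selector kept.
print(df_rfe)                # retained feature names
print(len(df_rfe))           # number of retained features
print(selected.n_features_)  # should match the count above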
#assumed imports for this excerpt (aliases inferred from usage):
import pandas as pd
import sklearn.preprocessing as pp
import sklearn.linear_model as lm
from sklearn.feature_selection import RFECV as rfe
from sklearn.model_selection import cross_val_score as cv

#scale data
scaler = pp.StandardScaler()
scaler.fit(post2000_exp)
post2000_exp_scaled = pd.DataFrame(scaler.transform(post2000_exp),
                                   index=post2000_exp.index,
                                   columns=post2000_exp.columns)

#retain unscaled features
post2000_exp = pd.concat([numfeat_post, catfeat_post_bin], axis=1)
post2000_exp = post2000_exp[pre2000_exp.columns]

#IDENTIFY POTENTIAL FEATURES WITH RECURSIVE FEATURE SEARCH AND 10-FOLD CV------
#run recursive feature search with 10-fold cv to identify potential features
lr = lm.LogisticRegression()
lr_rfe_cv = rfe(estimator=lr, step=1, cv=10, scoring='roc_auc', verbose=1)
lr_rfe_cv.fit(pre2000_exp_scaled, pre2000_res)

#identify features
features = pre2000_exp_scaled.columns[lr_rfe_cv.get_support()]
print(features)

#run 10-fold CV to get scores with selected features (ROC_AUC = 0.9451)
lr_cv = cv(lr, pre2000_exp_scaled[features], pre2000_res, cv=10, scoring='roc_auc')
print(lr_cv.mean())

#create dataset with response and selected features
lrset = pd.concat([pre2000_exp_scaled[features], pre2000_res], axis=1)
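# Optional inspection sketch (not in the original excerpt): plot the CV
# curve for lr_rfe_cv. On scikit-learn >= 1.2 the per-step mean scores
# live in cv_results_ (grid_scores_ was removed).
import matplotlib.pyplot as plt

mean_scores = lr_rfe_cv.cv_results_["mean_test_score"]
plt.figure()
plt.xlabel("LR: Number of features selected")
plt.ylabel("LR: Cross-validation score (ROC_AUC)")
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()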
#(preceding line truncated in this excerpt: importances are aggregated over axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
proba = [0] * 5  # accumulated importance per histone marker (index % 5 maps a feature to its marker)
for f in range(x_train.shape[1]):
    #each entry of proba corresponds to one histone marker
    print("%d. Histone %d (%f)" % (f + 1, indices[f] % 5, importances[indices[f]]))
    proba[indices[f] % 5] += importances[indices[f]]
print(proba)
print(sum(proba))

"""-> Feature selection with RFECV, with our XGBoost classifier as estimator"""
#(rfe is assumed to alias sklearn.feature_selection.RFECV; grid_scores_ was removed in scikit-learn 1.2)
rfecv = rfe(estimator=XGBClassifier(n_estimators=650, max_depth=18), step=50, verbose=1, cv=10)
rfecv.fit(x_train, y_train)
rfecv_scores = rfecv.grid_scores_

#now we identify the number of features selected from each histone marker
support = rfecv.support_
nb_selected_features = 0
histone_marker = []

#histone marker H3K4me3
for i in range(0, 500, 5):
    if support[i]:
        nb_selected_features += 1
histone_marker.append(['H3K4me3', nb_selected_features])

#histone marker H3K4me1
#(the loop repeats for each remaining marker; see the compact sketch below)
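# Compact equivalent of the per-marker loops (a sketch, assuming 500
# interleaved features as in the loop above; only H3K4me3 and H3K4me1
# are named in this excerpt, so the last three entries are placeholder
# names, not confirmed marker labels):
marker_names = ['H3K4me3', 'H3K4me1', 'marker_3', 'marker_4', 'marker_5']
histone_marker = [[name, int(support[offset::5].sum())]
                  for offset, name in enumerate(marker_names)]
print(histone_marker)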