def rfecv_credit_example(image="rfecv_credit.png"):
    """Render the RFECV curve for a random forest on the ``load_credit`` data.

    The visualizer is scored with ``f1_weighted`` over a 5-split
    ``StratifiedKFold`` and the resulting figure is saved under ``IMAGES``.

    Parameters
    ----------
    image : str
        File name, joined onto ``IMAGES``, that the figure is written to.
    """
    features, target = load_credit()
    axes = plt.subplots()[1]
    folds = StratifiedKFold(5)

    visualizer = RFECV(
        RandomForestClassifier(),
        ax=axes,
        cv=folds,
        scoring="f1_weighted",
    )
    visualizer.fit(features, target)

    destination = os.path.join(IMAGES, image)
    visualizer.show(outpath=destination)
def rfecv_sklearn_example(image="rfecv_sklearn_example.png"):
    """Render the RFECV curve for a linear SVC on a synthetic dataset.

    The data comes from ``make_classification`` (1000 samples, 25 features
    of which 3 are informative and 2 redundant, 8 classes, fixed
    ``random_state=0``). The figure is saved under ``IMAGES``.

    Parameters
    ----------
    image : str
        File name, joined onto ``IMAGES``, that the figure is written to.
    """
    # Settings for the synthetic classification problem.
    dataset_options = {
        "n_samples": 1000,
        "n_features": 25,
        "n_informative": 3,
        "n_redundant": 2,
        "n_repeated": 0,
        "n_classes": 8,
        "n_clusters_per_class": 1,
        "random_state": 0,
    }
    features, target = make_classification(**dataset_options)

    axes = plt.subplots()[1]
    visualizer = RFECV(SVC(kernel="linear", C=1), ax=axes)
    visualizer.fit(features, target)

    destination = os.path.join(IMAGES, image)
    visualizer.show(outpath=destination)
# In[24]:

# Instantiate the estimator.
reg = LinearRegression()

# In[25]:

# Rank the features using RFE with cross-validation.
# Original author's note: the dashed line in the plot marks the maximum
# score, reached with 27 features.
rfecv = RFECV(reg, cv=3, step=1)
rfecv.fit(X, y)
rfecv.show()

# In[26]:

# List each feature name alongside whether the model kept it.
list(zip(X.columns, rfecv.support_))

# In[27]:

# Select 5 features for the model using plain RFE (no cross-validation).
rfe = RFE(reg, n_features_to_select=5, step=1).fit(X, y)
"auc_roc_dtree": 9, "auc_roc_bernoulliNB": 10, "auc_roc_LDA": 11, "auc_roc_gaussianNB": 12 } df_results = df_results.replace({"Best model ROC": model_dict}) y = df_results["Best model ROC"] cv = StratifiedKFold(2) visualizer = RFECV(RandomForestClassifier(n_estimators=10), cv=cv, scoring='accuracy') visualizer.fit(X, y) # Fit the data to the visualizer visualizer.show() print("Optimal number of features : %d" % visualizer.n_features_) print(visualizer.ranking_) print(visualizer.estimator_.feature_importances_) index_list = [] for index, value in enumerate(visualizer.ranking_): if value == 1: index_list.append(index) else: pass selected_X = df_results.iloc[:, index_list] with open('RFE_X_AUCROC_202.pickle', 'wb') as handle:
# Disabled experiment, kept for reference: fit an RFE-wrapped LinearSVC for
# hand-picked (C, n_features) pairs and dump the predicted labels to disk.
# def check_model(c, n, X, y, X_test, y_test, class_names, outdir):
#     model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
#     rfe = RFE(model, n_features_to_select=n)
#     fit = rfe.fit(X,y)
#     y_predict = fit.predict(X_test)
#     predict_df = pd.DataFrame(y_predict.tolist())
#     predict_df.to_csv(outdir + '/predict_label.csv', sep='\t', index=False)
# test = [[1e-3,12],[1e-6,7],[1.0,32],[1e-6,27]]
# for c,n in test:
#     print(str(c)+ '-' +str(n))
#     this_out = 'figures/eva/c' + str(c) + '_n' + str(n)
#     check_model(c=c, n=n, outdir=this_out, X=X, y=y, X_test=X_test, y_test=y_test,
#                 class_names=class_names)

## plot RFECV for LinearSVC, one figure per regularisation strength C
for c in [1e-6, 1e-3, 1]:
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    viz = RFECV(model, scoring='f1_weighted')
    viz.fit(X, y)
    # Every C value gets its own output file. Previously all iterations wrote
    # to 'figures/linear_svc_rfecv.pdf', so only the last plot survived.
    outpath = 'figures/linear_svc_rfecv_c' + str(c) + '.pdf'
    # clear_figure=True resets the figure after saving. No `ax` is passed, so
    # without it the next visualizer would draw on top of this curve.
    viz.show(outpath=outpath, clear_figure=True)