def rfecv_credit_example(image="rfecv_credit.png"):
    X, y = load_credit()
    _, ax = plt.subplots()
    cv = StratifiedKFold(5)
    oz = RFECV(RandomForestClassifier(), ax=ax, cv=cv, scoring="f1_weighted")
    oz.fit(X, y)
    oz.poof(outpath=os.path.join(IMAGES, image))
def rfecv_sklearn_example(image="rfecv_sklearn_example.png"):
    X, y = make_classification(
        n_samples=1000,
        n_features=25,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        random_state=0,
    )
    _, ax = plt.subplots()
    oz = RFECV(SVC(kernel="linear", C=1), ax=ax)
    oz.fit(X, y)
    oz.poof(outpath=os.path.join(IMAGES, image))
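# Note: in Yellowbrick 1.0 and later, poof() was renamed to show() (poof() is kept only
# as a deprecated alias). A minimal sketch of the same rendering step with the newer
# API, reusing the IMAGES constant and `image` argument from the examples above:
#
#     oz.show(outpath=os.path.join(IMAGES, image))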
# Validation curve
viz = ValidationCurve(
    XGBRegressor(objective="reg:squarederror"),
    param_name="max_depth",
    param_range=np.arange(1, 11),
    cv=5,
    scoring="r2",
)
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)
viz_2.show()

# Recursive feature elimination with cross-validation
model = RFECV(LassoCV(), cv=5, scoring="r2")
model.fit(x_train, y_train)
model.show()

"""
Section: 5 Time-Series Algorithms
"""

# Fitting ARIMA

# ACF of the original series
# plt.rcParams.update({'figure.figsize': (9, 7), 'figure.dpi': 120})
fig, axes = plt.subplots(3, 1, sharex=True)
plot_acf(main_data.traffic_volume, ax=axes[0])

# ACF of the 1st differencing (drop the leading NaN introduced by diff())
plot_acf(main_data.traffic_volume.diff().dropna(), ax=axes[1])
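# "Fitting ARIMA" above is only followed by ACF diagnostics in this excerpt; the sketch
# below shows how the fit itself could look with statsmodels. The order (1, 1, 1) is an
# illustrative assumption, not the order selected in the original analysis.
from statsmodels.tsa.arima.model import ARIMA

arima_model = ARIMA(main_data.traffic_volume, order=(1, 1, 1))  # (p, d, q) assumed for illustration
arima_fit = arima_model.fit()
print(arima_fit.summary())  # coefficients, AIC/BIC and residual diagnostics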
def rfecv():
    X, y = load_credit()
    model = RandomForestClassifier(n_estimators=10)
    oz = RFECV(model, cv=3, scoring="f1_weighted", ax=newfig())
    oz.fit(X, y)
    savefig(oz, "rfecv_sklearn_example")
y = fifa_semNan.Overall

# In[24]:
# instantiate the estimator
reg = LinearRegression()

# In[25]:
# Rank the features with RFE plus cross-validation.
# The dashed line marks the maximum score, reached with 27 features.
rfecv = RFECV(reg, step=1, cv=3)
rfecv.fit(X, y)
rfecv.show()

# In[26]:
# list of the features kept by the model
list(zip(X.columns, rfecv.support_))

# In[27]:
# select 5 features for the model using plain RFE (without cross-validation)
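# The cell above announces selecting 5 features with plain RFE, but that code falls
# outside this excerpt; a minimal sketch with scikit-learn's RFE and the same estimator
# `reg` is given below (the variable names `rfe` and the printed list are assumptions).
from sklearn.feature_selection import RFE

rfe = RFE(reg, n_features_to_select=5, step=1)
rfe.fit(X, y)
print(list(X.columns[rfe.support_]))  # the 5 features retained by RFE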
"auc_roc_ADAboost": 5, "auc_roc_perceptron": 6, "auc_roc_QDA": 7, "auc_roc_linearSVC": 8, "auc_roc_dtree": 9, "auc_roc_bernoulliNB": 10, "auc_roc_LDA": 11, "auc_roc_gaussianNB": 12 } df_results = df_results.replace({"Best model ROC": model_dict}) y = df_results["Best model ROC"] cv = StratifiedKFold(2) visualizer = RFECV(RandomForestClassifier(n_estimators=10), cv=cv, scoring='accuracy') visualizer.fit(X, y) # Fit the data to the visualizer visualizer.show() print("Optimal number of features : %d" % visualizer.n_features_) print(visualizer.ranking_) print(visualizer.estimator_.feature_importances_) index_list = [] for index, value in enumerate(visualizer.ranking_): if value == 1: index_list.append(index) else: pass
        max_leaf_nodes=None, min_impurity_decrease=0, min_impurity_split=None,
        min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0,
        n_jobs=None, oob_score=False, random_state=random_state, max_depth=7,
        max_features="log2", verbose=0, warm_start=False)

rfecv = RFECV(estimator=classifier, step=1, cv=StratifiedKFold(3), scoring='f1_macro')
print(classifier)
print(rfecv)
# endregion

# region Train Model and plot importance
rfecv.fit(X_train, Y_train.ravel())
rfecv.show()
# endregion
visualizer.show()

# Learning curve
visualizer = LearningCurve(model, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Cross-validation scores
visualizer = CVScores(model, cv=5, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Feature importances
visualizer = FeatureImportances(model)
visualizer.fit(X_train, y_train)
visualizer.show()

# Recursive feature elimination with cross-validation
visualizer = RFECV(model, cv=5, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Validation curve over max_depth
visualizer = ValidationCurve(model, param_name="max_depth",
                             param_range=np.arange(1, 11), cv=5,
                             scoring="f1_weighted")
visualizer.fit(X_train, y_train)
visualizer.show()
      .format(np.mean(precisions), np.std(precisions)))

CVlog(X, 'X')
# X:
# With threshold = 0.5, the mean recall during cross-validation is: 0.589 (~0.053)
# With threshold = 0.5, the mean precision during cross-validation is: 0.151 (~0.024)

# CVlog(X, 'X', 0.81)
# With threshold = 0.81, the mean recall during cross-validation is: 0.500 (~0.040)
# With threshold = 0.81, the mean precision during cross-validation is: 0.448 (~0.034)

##########################################################################
#############            Feature reduction                  #############
# This step can take several hours (depending on the machine)

lr = LogisticRegression()
rfecv = RFECV(lr, step=1, cv=5, scoring='accuracy')
selector = rfecv.fit(X, Y)
print("The best descriptors after RFECV({}) are: \n {}".format(
    rfecv.n_features_,
    X.columns[[index for index, value in enumerate(list(rfecv.support_)) if value]]))

X_bestFeatures = X.loc[:, X.columns[[index for index, value in enumerate(list(rfecv.support_)) if value]]]

CVlog(X_bestFeatures, 'X_bestFeatures')
# With threshold = 0.5, the mean recall during cross-validation is: 0.594 (~0.042)
# With threshold = 0.5, the mean precision during cross-validation is: 0.149 (~0.026)
##########################################################################
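# Equivalent, more concise selection using the boolean mask exposed by the fitted
# selector (a sketch; assumes X is a pandas DataFrame, as above):
# X_bestFeatures = X.loc[:, rfecv.support_]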
# def check_model(c, n, X, y, X_test, y_test, class_names, outdir):
#     model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
#     rfe = RFE(model, n_features_to_select=n)
#     fit = rfe.fit(X, y)
#     y_predict = fit.predict(X_test)
#     predict_df = pd.DataFrame(y_predict.tolist())
#     predict_df.to_csv(outdir + '/predict_label.csv', sep='\t', index=False)

# test = [[1e-3, 12], [1e-6, 7], [1.0, 32], [1e-6, 27]]
# for c, n in test:
#     print(str(c) + '-' + str(n))
#     this_out = 'figures/eva/c' + str(c) + '_n' + str(n)
#     check_model(c=c, n=n, outdir=this_out, X=X, y=y, X_test=X_test, y_test=y_test,
#                 class_names=class_names)

## plot RFECV for LinearSVC
for c in [1e-6, 1e-3, 1]:
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    viz = RFECV(model, scoring='f1_weighted')
    viz.fit(X, y)
    # include C in the filename so each iteration writes its own plot instead of
    # overwriting the previous one
    viz.show(outpath='figures/linear_svc_rfecv_c{}.pdf'.format(c))