def rfecv_credit_example(image="rfecv_credit.png"):
    """Fit an RFECV visualizer on the credit dataset and save its figure.

    A default ``RandomForestClassifier`` is wrapped in ``RFECV`` with
    5-fold stratified cross-validation and ``f1_weighted`` scoring, and
    the resulting plot is written to ``IMAGES/<image>``.

    Parameters
    ----------
    image : str
        File name of the output image, joined onto ``IMAGES``.
    """
    features, target = load_credit()

    # Only the axes object is needed; the figure handle is discarded.
    axes = plt.subplots()[1]

    visualizer = RFECV(
        RandomForestClassifier(),
        ax=axes,
        cv=StratifiedKFold(5),
        scoring="f1_weighted",
    )
    visualizer.fit(features, target)

    destination = os.path.join(IMAGES, image)
    visualizer.poof(outpath=destination)
def rfecv_sklearn_example(image="rfecv_sklearn_example.png"):
    """Fit an RFECV visualizer on a synthetic dataset and save its figure.

    The data comes from ``make_classification`` with a fixed
    ``random_state``: 1000 samples, 25 features (3 informative,
    2 redundant, none repeated), 8 classes with one cluster each.
    A linear-kernel ``SVC`` (``C=1``) is used as the estimator, and the
    plot is written to ``IMAGES/<image>``.

    Parameters
    ----------
    image : str
        File name of the output image, joined onto ``IMAGES``.
    """
    dataset_options = dict(
        n_samples=1000,
        n_features=25,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        random_state=0,
    )
    features, target = make_classification(**dataset_options)

    # Only the axes object is needed; the figure handle is discarded.
    axes = plt.subplots()[1]

    estimator = SVC(kernel="linear", C=1)
    visualizer = RFECV(estimator, ax=axes)
    visualizer.fit(features, target)

    destination = os.path.join(IMAGES, image)
    visualizer.poof(outpath=destination)
Exemple #3
0
# Validation curve
# 5-fold cross-validated R^2 of an XGBoost regressor while max_depth is
# varied over the integers 1 through 10.
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth",
                      param_range=np.arange(1, 11),
                      cv=5,
                      scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
# R^2 of a freshly constructed XGBoost regressor with the same objective;
# no cv or training sizes are passed, so the visualizer's own defaults apply.
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
viz_2.fit(x_train, y_train)
viz_2.show()

# Recursive feature elimination with 5-fold cross-validation, scored by R^2,
# using LassoCV as the estimator.
# NOTE: this rebinds `model`; the XGBRegressor created above is no longer
# reachable through that name after this line.
model = RFECV(LassoCV(), cv=5, scoring='r2')
model.fit(x_train, y_train)
model.show()
"""
Section: 5
Time-Series Algorithms
"""
# Fitting ARIMA
# Compare the autocorrelation of the series before and after differencing.
# Original Series
# plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
# Three stacked axes sharing one x-axis; the lines visible here fill the
# first two.
fig, axes = plt.subplots(3, 1, sharex=True)
plot_acf(main_data.traffic_volume, ax=axes[0])

# 1st Differencing
# diff() leaves a NaN in the first position (there is no previous value to
# subtract). Drop it so the autocorrelation is computed on valid values only
# instead of being poisoned by the missing entry.
# Assumes traffic_volume is a pandas Series - TODO confirm.
plot_acf(main_data.traffic_volume.diff().dropna(), ax=axes[1])
def rfecv():
    """Fit an RFECV visualizer on the credit dataset and save its figure.

    A 10-tree ``RandomForestClassifier`` is wrapped in ``RFECV`` with
    ``cv=3`` and ``f1_weighted`` scoring, drawn on the axes returned by
    ``newfig()``, and handed to ``savefig``.
    """
    features, target = load_credit()

    visualizer = RFECV(
        RandomForestClassifier(n_estimators=10),
        cv=3,
        scoring="f1_weighted",
        ax=newfig(),
    )
    visualizer.fit(features, target)

    # NOTE(review): the output name says "sklearn_example" although the data
    # is the credit dataset - confirm this is the intended file name.
    savefig(visualizer, "rfecv_sklearn_example")
Exemple #5
0
y = fifa_semNan.Overall


# In[24]:


# Instantiate the estimator class
reg = LinearRegression()


# In[25]:


# Feature ranking with RFE plus cross-validation (step=1, cv=3)
# The dashed marker represents the maximum score, reached with 27 features
rfecv = RFECV(reg,step=1, cv=3)
rfecv.fit(X,y)
rfecv.show()


# In[26]:


# List of features used by the model: each column name paired with its
# entry in the fitted selector's support_ mask
list(zip(X.columns, rfecv.support_))


# In[27]:


# Select 5 features for the model using RFE only
Exemple #6
0
    "auc_roc_ADAboost": 5,
    "auc_roc_perceptron": 6,
    "auc_roc_QDA": 7,
    "auc_roc_linearSVC": 8,
    "auc_roc_dtree": 9,
    "auc_roc_bernoulliNB": 10,
    "auc_roc_LDA": 11,
    "auc_roc_gaussianNB": 12
}

# Replace the values of the "Best model ROC" column through model_dict
# (presumably the name -> integer-code mapping defined just above; verify)
# and use the resulting column as the classification target.
df_results = df_results.replace({"Best model ROC": model_dict})
y = df_results["Best model ROC"]

# Recursive feature elimination with 2-fold stratified cross-validation,
# scored by accuracy, on a 10-tree random forest.
cv = StratifiedKFold(2)
visualizer = RFECV(RandomForestClassifier(n_estimators=10),
                   cv=cv,
                   scoring='accuracy')

visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.show()

print("Optimal number of features : %d" % visualizer.n_features_)
print(visualizer.ranking_)
print(visualizer.estimator_.feature_importances_)

# Positions of the features whose rank is 1. Built with a comprehension
# instead of an append loop with a no-op `else: pass` branch.
index_list = [
    index for index, value in enumerate(visualizer.ranking_) if value == 1
]
Exemple #7
0
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0,
                                        n_jobs=None,
                                        oob_score=False,
                                        random_state=random_state,
                                        max_depth=7,
                                        max_features="log2",
                                        verbose=0,
                                        warm_start=False)

    rfecv = RFECV(estimator=classifier,
                  step=1,
                  cv=StratifiedKFold(3),
                  scoring='f1_macro')

    print(classifier)
    print(rfecv)

    # endregion

    # region Train Model and plot importance

    rfecv.fit(X_train, Y_train.ravel())

    rfecv.show()

    # endregion
Exemple #8
0
    visualizer.show()

    # Learning curve
    visualizer = LearningCurve(model, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Cross-validation scores
    visualizer = CVScores(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Feature importances
    visualizer = FeatureImportances(model)
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Recursive feature elimination
    visualizer = RFECV(model, cv=5, scoring='f1_weighted')
    visualizer.fit(X_train, y_train)
    visualizer.show()

    # Validation curve over max_depth
    visualizer = ValidationCurve(model,
                                 param_name="max_depth",
                                 param_range=np.arange(1, 11),
                                 cv=5,
                                 scoring="f1_weighted")
    visualizer.fit(X_train, y_train)
    visualizer.show()
Exemple #9
0
        .format(np.mean(precisions), np.std(precisions)))


CVlog(X, 'X')
# Recorded results (translated from the comments originally left here):
# X:
# With threshold = 0.5 the mean recall during cross-validation is: 0.589 (~0.053)
# With threshold = 0.5 the mean precision during cross-validation is: 0.151 (~0.024)
# CVlog(X,'X',0.81)
# With threshold = 0.81 the mean recall during cross-validation is: 0.500 (~0.040)
# With threshold = 0.81 the mean precision during cross-validation is: 0.448 (~0.034)

##########################################################################
#############               Feature reduction                #############
# This step can take several hours (depending on the computer)
lr = LogisticRegression()
rfecv = RFECV(lr, step=1, cv=5, scoring='accuracy')
selector = rfecv.fit(X, Y)

# support_ holds one boolean per column of X. Index the column labels with
# that mask once, instead of rebuilding the list of selected positions twice
# with `value == True` comparisons.
best_columns = X.columns[rfecv.support_]
print("The best descriptors after RFECV({}) are: \n {}".format(
    rfecv.n_features_, best_columns))
X_bestFeatures = X.loc[:, best_columns]

CVlog(X_bestFeatures, 'X_bestFeatures')
# With threshold = 0.5 the mean recall during cross-validation is: 0.594 (~0.042)
# With threshold = 0.5 the mean precision during cross-validation is: 0.149 (~0.026)

##########################################################################

# def check_model(c, n, X, y, X_test, y_test, class_names, outdir):
#     model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
#     rfe = RFE(model, n_features_to_select=n)

#     fit = rfe.fit(X,y)
#     y_predict = fit.predict(X_test)
#     predict_df = pd.DataFrame(y_predict.tolist())
#     predict_df.to_csv(outdir + '/predict_label.csv', sep='\t', index=False)



# test = [[1e-3,12],[1e-6,7],[1.0,32],[1e-6,27]]
# for c,n in test:
#     print(str(c)+ '-' +str(n))
#     this_out = 'figures/eva/c' + str(c) + '_n' + str(n)
#     check_model(c=c, n=n, outdir=this_out, X=X, y=y, X_test=X_test, y_test=y_test,
#               class_names=class_names)


## plot RFECV for LinearSVC
# Fit one RFECV visualizer per regularisation strength C and save each plot.
for c in [1e-6, 1e-3, 1]:
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    viz = RFECV(model, scoring='f1_weighted')
    viz.fit(X, y)
    # Put C in the file name: with a single fixed path every iteration
    # overwrote the previous figure, so only the last one survived on disk.
    # NOTE(review): no `ax` is passed, so the visualizers may draw on the same
    # current matplotlib axes - confirm plots do not accumulate across
    # iterations.
    viz.show(outpath='figures/linear_svc_rfecv_c{}.pdf'.format(c))