Example #1
from pathlib import Path
from typing import List, Tuple

from matplotlib import pyplot as plt
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator
from yellowbrick.model_selection import FeatureImportances


def show_FeatureImportances(
    est: BaseEstimator,
    conf_mat_labels: List,
    X: DataFrame,
    y: Series,
    fig_size: Tuple = (8, 8),
    savefig: Path = Path.cwd() / "reports" / "figures" / "feats_imps.png",
    save_pref: bool = False,
) -> None:
    """Fit a yellowbrick FeatureImportances visualizer and display it,
    optionally saving the figure if it does not already exist."""
    fig, ax = plt.subplots(figsize=fig_size)
    viz = FeatureImportances(est,
                             stack=True,
                             labels=conf_mat_labels,
                             relative=False,
                             ax=ax)
    viz.fit(X, y)
    viz.show()
    if save_pref and not savefig.is_file():
        fig.savefig(savefig, bbox_inches="tight", dpi=300)
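
# A minimal usage sketch for the helper above; the RandomForestClassifier and
# the X/y frames here are placeholders, not part of the original snippet.
from sklearn.ensemble import RandomForestClassifier

show_FeatureImportances(
    RandomForestClassifier(n_estimators=100, random_state=0),
    conf_mat_labels=list(X.columns),
    X=X,
    y=y,
    save_pref=True,  # writes reports/figures/feats_imps.png on first call
)
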
# Balanced accuracy for the logistic-regression predictions (one score per
# commodity is appended to `scores`; see data_score below).
logistic_score = balanced_accuracy_score(y_test, y_pred_Logisticregression.round(),
                                         adjusted=False)
# logistic_score_acc = accuracy_score(y_test, y_pred_Logisticregression)
scores.append(logistic_score)
print(logistic_score)

# Collect one balanced-accuracy score per commodity and persist to CSV.
data_score = pd.DataFrame({'Commodity': y_location_trains.columns, 'score': scores})
print(data_score)
data_score.to_csv('/Users/monalisa/Downloads/mmai823-project-master/out/linear_scores.csv')
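
# The commented-out accuracy_score alternative above hints at the choice of
# metric; a self-contained toy sketch of the difference on an imbalanced
# target (data made up for illustration):
from sklearn.metrics import accuracy_score, balanced_accuracy_score

y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]        # 80/20 class imbalance
y_hat = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]         # always predicts the majority class
print(accuracy_score(y_true, y_hat))           # 0.8 -- looks strong
print(balanced_accuracy_score(y_true, y_hat))  # 0.5 -- chance level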


print(X_train.columns)
# Feature importance
#viz_feat = FeatureImportances(rfc, labels=X_train.columns, relative=False)
from matplotlib import pyplot as plt
%matplotlib inline
viz_features = FeatureImportances(logistic, labels=X_train.columns)
viz_features.fit(X_train, y_train)
viz_features.show()
plt.tight_layout()
selector = RFE(estimator, verbose=0, n_features_to_select=40)  #Citation - scikit-learn.org sklearn API reference documentation

selector.fit(X, y)

#Select the best features
selected = [X.columns[i] for i in selector.get_support(indices=True)]  #Citation - scikit-learn.org sklearn API reference documentation

selected

X = X[selected]
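
# Quick inspection of the RFE fit, using documented sklearn attributes:
print(selector.n_features_)  # number of features retained (40 here)
print(selector.ranking_)     # rank 1 = selected; larger ranks were pruned earlier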

plt.figure(figsize=(11, 9))
ax = plt.gca()  #Citation - from matplotlib.org matplotlib API reference documentation

# Title case the feature for better display and create the visualizer
model = RandomForestClassifier(n_jobs=-1)  #Citation - scikit-learn.org sklearn API reference documentation
labels = list(map(lambda s: s.title(), X.columns))
viz = FeatureImportances(model, labels=labels, relative=True)  #Citation - scikit-yb.org yellowbrick documentation

# Fit and show the feature importances
viz.fit(X, y)
viz.show(ax=ax)
# Task 8: Feature importance and evaluation metrics

from yellowbrick.model_selection import FeatureImportances
plt.rcParams['figure.figsize'] = (12,8)
plt.style.use("ggplot")

rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            max_depth=5, max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                            oob_score=False, random_state=1, verbose=False,
                            warm_start=False)
viz = FeatureImportances(rf)
viz.fit(X_train, y_train)
viz.show();
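
# For reference: per the yellowbrick docs, FeatureImportances reads the fitted
# estimator's feature_importances_ (tree ensembles) or coef_ (linear models),
# so the same numbers are available directly once viz.fit() has run:
top10 = sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)[:10]
print(top10)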


dt = DecisionTreeClassifier(class_weight=None, criterion='gini',
                            max_depth=3, max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
                            splitter='best')

viz = FeatureImportances(dt)
viz.fit(X_train, y_train)
viz.show();


from yellowbrick.classifier import ROCAUC
Example #5
# NOTE: the opening of this example was not captured; the def line below is a
# reconstruction from the body, and the fit call is assumed to have been in
# the elided part (the helper name is hypothetical).
def fit_and_plot(estimator, X_train, y_train, X_test, y_test):
    estimator.fit(X_train, y_train)
    print(accuracy_score(y_train, estimator.predict(X_train)))
    print(accuracy_score(y_test, estimator.predict(X_test)))
    num_tree = estimator.estimators_[0]  # visualize the ensemble's first tree
    graph = Source(tree.export_graphviz(num_tree, out_file=None, feature_names=X_train.columns,
                                        class_names=['0', '1'], filled=True))
    display(Image(data=graph.pipe(format='png')))
    return estimator
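
# A usage sketch for the reconstructed helper above; its name and signature
# are assumptions (see the note there), as is this configuration:
fitted_rf = fit_and_plot(RandomForestClassifier(n_estimators=10, max_depth=3, random_state=1),
                         X_train, y_train, X_test, y_test)
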
# %%
plt.rcParams['figure.figsize'] = (12,8)
plt.style.use("ggplot")
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=3,
                            max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_jobs=1, oob_score=False, random_state=1, verbose=False, warm_start=False)
viz = FeatureImportances(rf)
viz.fit(X_train,y_train)
viz.show()
# %%
visualizer = ROCAUC(rf,classes=['stayed','quit'])
visualizer.fit(X_train,y_train)
visualizer.score(X_test,y_test)
visualizer.poof()  # poof() is the pre-v1.0 yellowbrick name for show()
# %%
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

visualizer = ROCAUC(dt, classes=["stayed", "quit"])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

X_1 = data[features]
y_1 = data.popularity

X_2 = data_without_old[features]
y_2 = data_without_old.popularity

X_3 = data_without_old[features_2]
y_3 = data_without_old.popularity

train_X_1, test_X_1, train_y_1, test_y_1 = train_test_split(X_1, y_1, test_size=0.1, random_state=0)
train_X_2, test_X_2, train_y_2, test_y_2 = train_test_split(X_2, y_2, test_size=0.1, random_state=0)
train_X_3, test_X_3, train_y_3, test_y_3 = train_test_split(X_3, y_3, test_size=0.1, random_state=0)


# Fit each FeatureImportances visualizer; its fit() both fits the wrapped
# model and draws the chart, so calling show() on an unfitted visualizer
# would yield an empty plot.
rfr_model_1 = RandomForestRegressor()
feature_importance_1 = FeatureImportances(rfr_model_1)
feature_importance_1.fit(train_X_1, train_y_1)

rfr_model_2 = RandomForestRegressor()
feature_importance_2 = FeatureImportances(rfr_model_2)
feature_importance_2.fit(train_X_2, train_y_2)

rfr_model_3 = RandomForestRegressor()
feature_importance_3 = FeatureImportances(rfr_model_3)
feature_importance_3.fit(train_X_3, train_y_3)


feature_importance_1.show()
feature_importance_2.show()
feature_importance_3.show()
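
# The three blocks above repeat one pattern; a sketch of the same work as a
# loop, giving each visualizer its own axes so the charts don't overdraw
# (a refactor suggestion, not part of the original):
import matplotlib.pyplot as plt

for X_tr, y_tr in [(train_X_1, train_y_1), (train_X_2, train_y_2), (train_X_3, train_y_3)]:
    fig, ax = plt.subplots(figsize=(12, 8))
    fi = FeatureImportances(RandomForestRegressor(), ax=ax)
    fi.fit(X_tr, y_tr)
    fi.show()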