def answer_five():
    """Fit a depth-limited decision tree and a linear SVC on the mushroom
    training split, report train/test accuracy for both models, plot the
    tree's feature importances, and return True.

    Relies on the module-level splits X_train2/X_test2/y_train2/y_test2.
    """
    from sklearn.tree import DecisionTreeClassifier
    from adspy_shared_utilities import plot_decision_tree  # TM
    from adspy_shared_utilities import plot_feature_importances  # TM
    from sklearn.svm import SVC  # TM

    tree_clf = DecisionTreeClassifier(max_depth=5,
                                      min_samples_leaf=10,
                                      random_state=0).fit(X_train2, y_train2)
    svc_clf = SVC(kernel='linear', C=10).fit(X_train2, y_train2)

    print('Mushroom dataset: decision tree')
    print('Accuracy of DT classifier on training set: {:.2f}'
          .format(tree_clf.score(X_train2, y_train2)))
    print('Accuracy of DT classifier on test set: {:.2f}'.format(tree_clf.score(X_test2, y_test2)))
    print('\n')
    print('Mushroom dataset: SVC')
    print('Accuracy of SVC classifier on training set: {:.2f}'
          .format(svc_clf.score(X_train2, y_train2)))
    print('Accuracy of SVC classifier on test set: {:.2f}'.format(svc_clf.score(X_test2, y_test2)))

    # Bar chart of the tree's feature importances.
    plt.figure(figsize=(10, 6), dpi=80)
    plot_feature_importances(tree_clf, X_train2.columns)
    plt.tight_layout()
    plt.show()
    return True
def show_dectree_info():
    """Fit an unconstrained decision tree on the mushroom training split and
    display its feature importances: the top five as a table, all of them
    as a bar plot.

    Relies on the module-level X_train2/y_train2 and on pandas/matplotlib.
    (Previous docstring claimed the classifier was passed as an argument;
    it is built locally.)
    """
    clf = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)
    fi = pd.DataFrame(data=clf.feature_importances_,
                      index=X_train2.columns,
                      columns=['feature importance'])
    fi.sort_values('feature importance', ascending=False, inplace=True)
    top_fi = fi.head(5)
    print('Feature importances: {}'.format(top_fi))
    plt.figure(figsize=(10, 4), dpi=80)
    # Label with the TRAINING columns: importances are defined for the
    # features the tree was fit on.  (X_test2 shares the same columns, but
    # the previous use of X_test2.columns was misleading.)
    plot_feature_importances(clf, X_train2.columns)
    plt.show()
def answer_five():
    """Return the names of the five most important features of a decision
    tree fit (random_state=0) on the mushroom training split, and plot the
    importances as a side effect.
    """
    from sklearn.tree import DecisionTreeClassifier
    from adspy_shared_utilities import plot_feature_importances

    fitted = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)
    plot_feature_importances(fitted, X_train2.columns)

    # Indices of the features, most important first.
    ranked = np.argsort(fitted.feature_importances_)[::-1]
    # Map the top-five indices back to column names.
    return [X_train2.columns[idx] for idx in ranked[:5]]
def feature_importance():
    """Plot and print the feature importances of the module-level fitted
    classifier ``clf``, labelled with ``feature_names_list``.

    Returns the result of plt.show() (None) so the figure is rendered.
    """
    from adspy_shared_utilities import plot_feature_importances
    import matplotlib.pyplot as plt
    # In a Jupyter notebook you may need: %matplotlib notebook
    plt.figure(figsize=(10, 4), dpi=80)
    plot_feature_importances(clf, feature_names_list)
    # Format every importance to 3 decimals.  Works for any number of
    # features; the previous version hard-coded exactly seven indices and
    # would raise IndexError (or silently truncate) for other widths.
    print('Feature importances: '
          + ', '.join('{:.3f}'.format(v) for v in clf.feature_importances_))
    return plt.show()
plot_decision_tree(clf, iris.feature_names, iris.target_names) # #### Pre-pruned version (max_depth = 3) # In[ ]: plot_decision_tree(clf2, iris.feature_names, iris.target_names) # #### Feature importance # In[ ]: from adspy_shared_utilities import plot_feature_importances plt.figure(figsize=(10, 4), dpi=80) plot_feature_importances(clf, iris.feature_names) plt.show() print('Feature importances: {}'.format(clf.feature_importances_)) # In[ ]: from sklearn.tree import DecisionTreeClassifier from adspy_shared_utilities import plot_class_regions_for_classifier_subplot X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0) fig, subaxes = plt.subplots(6, 1, figsize=(6, 32)) pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances
from sklearn import tree

# Load the mushroom data and one-hot encode every categorical column.
mush_df = pd.read_csv("mushrooms.csv")
print(mush_df)
mush_df2 = pd.get_dummies(mush_df)
print(mush_df2)

# Column 1 of the dummy frame is the binary class label; columns 2:
# are the encoded features.
X_mush = mush_df2.iloc[:, 2:]
y_mush = mush_df2.iloc[:, 1]
print(X_mush)
print(y_mush)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush, random_state=0)
X_subset = X_test2
y_subset = y_test2

model = DecisionTreeClassifier(random_state=0)
clf = model.fit(X_train2, y_train2)
print(clf.score(X_train2, y_train2))

plt.figure(figsize=(10, 20))
plot_feature_importances(clf, X_train2.columns)
# plt.show()
# print(-np.sort(-clf.feature_importances_))

# Feature indices ranked by importance, descending.  (The previous
# version bound this to the name `sorted`, shadowing the builtin.)
order = np.argsort(-clf.feature_importances_)
# Names of the five most important features.
names = [X_train2.columns[i] for i in order[:5]]
print(names)
# Visualizing decision trees (TODO: make it display): plt.figure() plot_decision_tree(clf, iris.feature_names, iris.target_names) # (Figure 22) # Color intensity represents which majority class is present in each node. # values section corresponds to how many training instances belong in each class. # Visualizing (pre-pruned version max_depth = 3) plt.figure() plot_decision_tree(clf2, iris.feature_names, iris.target_names) # (Figure 23) # Feature importance from adspy_shared_utilities import plot_feature_importances plt.figure(figsize=(10,4), dpi=80) # Dots per inch plot_feature_importances(clf, iris.feature_names) # (Figure 24) print("Feature importances: {}\n".format(clf.feature_importances_)) # Inherent property of classifier, not user-defined property #from sklearn.tree import DecisionTreeClassifier #from adspy_shared_utilities import plot_class_regions_for_classifier_subplot as pcr X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0) fig, subaxes = plt.subplots(1, 6, figsize=(32, 6)) pair_list = [[0, 1], [0, 2], [0, 3], [1, 2] ,[1, 3], [2, 3]] tree_max_depth = 4 for pair, axis in zip(pair_list, subaxes): X = X_train[:, pair] y = y_train clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
def decisiontree():
    """Decision-tree walkthrough on two datasets.

    Iris: fit an unpruned and a pre-pruned (max_depth=3) tree, print
    train/test accuracy for both, render both trees, plot feature
    importances, then draw per-feature-pair decision regions.

    Breast cancer: fit a pruned tree (max_depth=4, min_samples_leaf=8),
    render it, print accuracies, and plot its feature importances.

    All output is via print() and matplotlib; returns None.
    """
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=3)

    # Unpruned tree: typically overfits (perfect training accuracy).
    clf = DecisionTreeClassifier().fit(X_train, y_train)
    print(
        'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
            clf.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    # Pre-pruned tree for comparison.
    clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    print(
        'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
            clf2.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf2.score(X_test, y_test)))

    plot_decision_tree(clf, iris.feature_names, iris.target_names)
    plot_decision_tree(clf2, iris.feature_names, iris.target_names)

    plt.figure(figsize=(10, 4), dpi=80)
    plot_feature_importances(clf, iris.feature_names)
    plt.show()
    print('Feature importances: {}'.format(clf.feature_importances_))

    # Decision regions for every pair of the four iris features
    # (fresh split with random_state=0 to match the course material).
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=0)
    fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
    pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    tree_max_depth = 4
    for pair, axis in zip(pair_list, subaxes):
        X = X_train[:, pair]
        y = y_train
        clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
        title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
        plot_class_regions_for_classifier_subplot(clf, X, y, None, None,
                                                  title, axis,
                                                  iris.target_names)
        axis.set_xlabel(iris.feature_names[pair[0]])
        axis.set_ylabel(iris.feature_names[pair[1]])
    plt.tight_layout()
    plt.show()

    cancer = load_breast_cancer()
    # Reuse the already-loaded Bunch instead of a redundant second
    # load_breast_cancer(return_X_y=True) call.
    X_cancer, y_cancer = cancer.data, cancer.target
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                        random_state=0)
    clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8,
                                 random_state=0).fit(X_train, y_train)
    plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

    print('Breast cancer dataset: decision tree')
    print('Accuracy of DT classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of DT classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))
    plt.figure(figsize=(10, 6), dpi=80)
    plot_feature_importances(clf, cancer.feature_names)
    plt.tight_layout()
    plt.show()