# #### Setting max decision tree depth to help avoid overfitting

# In[ ]:

clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
      .format(clf2.score(X_test, y_test)))

# #### Visualizing decision trees

# In[ ]:

plot_decision_tree(clf, iris.feature_names, iris.target_names)

# #### Pre-pruned version (max_depth = 3)

# In[ ]:

plot_decision_tree(clf2, iris.feature_names, iris.target_names)

# #### Feature importance

# In[ ]:

from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10, 4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
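# A minimal sketch of how pre-pruning trades training fit for test-set
# generalization, assuming the same X_train/X_test/y_train/y_test split as
# above is in scope; the candidate depths are arbitrary illustration values.

# In[ ]:

from sklearn.tree import DecisionTreeClassifier

for depth in [1, 2, 3, 5, 10, None]:
    clf_d = DecisionTreeClassifier(max_depth=depth, random_state=0)
    clf_d.fit(X_train, y_train)
    print('max_depth={}: train {:.2f}, test {:.2f}'.format(
        depth, clf_d.score(X_train, y_train), clf_d.score(X_test, y_test)))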
def mess_with_iris():
    #from adspy_shared_utilities import plot_feature_importances
    from adspy_shared_utilities import plot_decision_tree
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import load_iris
    import pandas as pd
    import numpy as np

    iris = load_iris()
    #iris.feature_names
    #>> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

    # separate data
    seplen, sepwid, petlen, petwid = (iris.data[:, 0], iris.data[:, 1],
                                      iris.data[:, 2], iris.data[:, 3])

    # turn sklearn.datasets.base.Bunch data into a pandas DataFrame
    df = pd.DataFrame(np.c_[iris.data, iris.target],
                      columns=iris['feature_names'] + ['target'])

    # define a new variable of (sepal length / sepal width)
    seplenwid = seplen / sepwid
    #seplenwid == (df['sepal length (cm)'] / df['sepal width (cm)']).tolist()

    # make the list into a DataFrame
    df_seplenwid = pd.DataFrame({'sepal (length/width)': seplenwid})

    # merge the new DataFrames into the original DataFrame, as columns
    df = df.join(df_seplenwid)
    df = df.join(
        pd.DataFrame({
            'petal (length/width)':
            (df['petal length (cm)'] / df['petal width (cm)']).tolist()
        }))

    # add more columns, comparing sepal and petal lengths and widths
    df = df.join(
        pd.DataFrame({
            'sep.len/pet.wid':
            (df['sepal length (cm)'] / df['petal width (cm)']).tolist()
        }))
    df = df.join(
        pd.DataFrame({
            'sep.wid/pet.len':
            (df['sepal width (cm)'] / df['petal length (cm)']).tolist()
        }))

    # separate df into data and target, for the ML classifier
    df_no_target = df[[
        'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'sepal (length/width)', 'petal (length/width)', 'sep.len/pet.wid',
        'sep.wid/pet.len'
    ]]
    target = np.array(df['target'].tolist())

    # split the data for the ML classifier
    X_train, X_test, y_train, y_test = train_test_split(df_no_target, target,
                                                        random_state=0)
    clf = DecisionTreeClassifier().fit(X_train, y_train)
    #print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    #print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
    #>> Accuracy of Decision Tree classifier on training set: 1.00
    #>> Accuracy of Decision Tree classifier on test set: 0.97

    """clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    #print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf2.score(X_train, y_train)))
    #print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf2.score(X_test, y_test)))
    #>> Accuracy of Decision Tree classifier on training set: 0.98
    #>> Accuracy of Decision Tree classifier on test set: 0.95"""

    # make a list of the new feature names, for better classification
    feature_names_list = df_no_target.keys().tolist()

    # make a bar plot of feature importance
    def feature_importance():
        from adspy_shared_utilities import plot_feature_importances
        import matplotlib.pyplot as plt
        ##################################################
        ### Jupyter Notebooks Needs This Line
        #%matplotlib notebook
        ##################################################
        #plt.clf()
        plt.figure(figsize=(10, 4), dpi=80)
        plot_feature_importances(clf, feature_names_list)
        #print('Feature importances: {}'.format(clf.feature_importances_))
        print('Feature importances: '
              '{:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}, {:.3f}'
              .format(*clf.feature_importances_))
        return plt.show()

    # call the function to make the bar plot
    feature_importance()

    #return iris.feature_names
    #>> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    return plot_decision_tree(clf, feature_names_list, iris.target_names)

#mess_with_iris()
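# The ratio features above could also be built in one step with
# DataFrame.assign instead of repeated joins. A compact alternative sketch
# of the same feature engineering; the column names mirror those used in
# mess_with_iris() and are otherwise arbitrary.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df = df.assign(**{
    'sepal (length/width)': df['sepal length (cm)'] / df['sepal width (cm)'],
    'petal (length/width)': df['petal length (cm)'] / df['petal width (cm)'],
    'sep.len/pet.wid': df['sepal length (cm)'] / df['petal width (cm)'],
    'sep.wid/pet.len': df['sepal width (cm)'] / df['petal length (cm)'],
})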
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}\n'
      .format(clf.score(X_test, y_test)))

# Setting max decision tree depth to help avoid overfitting:
clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier (max_depth = 3) on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier (max_depth = 3) on test set: {:.2f}\n'
      .format(clf2.score(X_test, y_test)))

# Visualizing decision trees (TODO: make it display):
plt.figure()
plot_decision_tree(clf, iris.feature_names, iris.target_names)  # (Figure 22)
# Color intensity represents the majority class present in each node.
# The "value" section shows how many training instances of each class reach that node.

# Visualizing the pre-pruned version (max_depth = 3):
plt.figure()
plot_decision_tree(clf2, iris.feature_names, iris.target_names)  # (Figure 23)

# Feature importance
from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10, 4), dpi=80)  # dpi = dots per inch
plot_feature_importances(clf, iris.feature_names)  # (Figure 24)
print("Feature importances: {}\n".format(clf.feature_importances_))
# feature_importances_ is a fitted attribute of the classifier, not a user-defined property

#from sklearn.tree import DecisionTreeClassifier
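# Since the TODO above notes the adspy helper may not display, here is a
# minimal alternative sketch using scikit-learn's built-in plot_tree
# (available since scikit-learn 0.21), assuming clf and the iris Bunch
# from above are in scope.
import matplotlib.pyplot as plt
from sklearn import tree

plt.figure(figsize=(12, 8))
tree.plot_tree(clf,
               feature_names=iris.feature_names,
               class_names=list(iris.target_names),
               filled=True)  # fill color intensity tracks the majority class
plt.show()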
print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
print('test score of the model :' + str(clf.score(X_test, y_test)))

# 5. Tuning the model.
from sklearn.model_selection import RandomizedSearchCV

clf = DecisionTreeClassifier(random_state=0)
parameters = {
    'max_depth': range(1, 12),
    'criterion': ('gini', 'entropy'),
    # 'auto' was removed in scikit-learn 1.3; None uses all features
    'max_features': (None, 'sqrt', 'log2'),
    'min_samples_leaf': (2, 4, 6, 8, 10, 12),
    'min_samples_split': (2, 4, 6, 8, 10, 12)
}
DT_grid = RandomizedSearchCV(clf, param_distributions=parameters, cv=5,
                             verbose=True)
DT_grid.fit(X_train, y_train)

# 6. Evaluating the score of the model.
print('train score of the model :' + str(DT_grid.score(X_train, y_train)))
print('test score of the model :' + str(DT_grid.score(X_test, y_test)))
print(DT_grid.best_estimator_)
plot_decision_tree(DT_grid.best_estimator_, cancer.feature_names,
                   cancer.target_names)
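# A short usage sketch for inspecting the search, assuming the fitted
# DT_grid from above: best_params_ holds the winning configuration, and
# cv_results_ holds the mean cross-validated score of every sampled
# candidate.
import pandas as pd

print('best parameters:', DT_grid.best_params_)
print('best CV score  : {:.3f}'.format(DT_grid.best_score_))

cv_df = pd.DataFrame(DT_grid.cv_results_)
print(cv_df[['params', 'mean_test_score', 'rank_test_score']]
      .sort_values('rank_test_score').head())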
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import (plot_decision_tree,
                                    plot_feature_importances,
                                    plot_class_regions_for_classifier_subplot)


def decisiontree():
    # Iris: unpruned vs. pre-pruned (max_depth=3) trees
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=3)
    clf = DecisionTreeClassifier().fit(X_train, y_train)
    print('Accuracy of Decision Tree classifier on training set: {:.2f}'
          .format(clf.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'
          .format(clf.score(X_test, y_test)))

    clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    print('Accuracy of Decision Tree classifier on training set: {:.2f}'
          .format(clf2.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'
          .format(clf2.score(X_test, y_test)))

    plot_decision_tree(clf, iris.feature_names, iris.target_names)
    plot_decision_tree(clf2, iris.feature_names, iris.target_names)

    plt.figure(figsize=(10, 4), dpi=80)
    plot_feature_importances(clf, iris.feature_names)
    plt.show()
    print('Feature importances: {}'.format(clf.feature_importances_))

    # Decision boundaries for each pair of iris features
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=0)
    fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
    pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    tree_max_depth = 4

    for pair, axis in zip(pair_list, subaxes):
        X = X_train[:, pair]
        y = y_train
        clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
        title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
        plot_class_regions_for_classifier_subplot(clf, X, y, None, None,
                                                  title, axis,
                                                  iris.target_names)
        axis.set_xlabel(iris.feature_names[pair[0]])
        axis.set_ylabel(iris.feature_names[pair[1]])

    plt.tight_layout()
    plt.show()

    # Breast cancer dataset: pre-pruned decision tree
    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                        random_state=0)
    clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8,
                                 random_state=0).fit(X_train, y_train)
    plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

    print('Breast cancer dataset: decision tree')
    print('Accuracy of DT classifier on training set: {:.2f}'
          .format(clf.score(X_train, y_train)))
    print('Accuracy of DT classifier on test set: {:.2f}'
          .format(clf.score(X_test, y_test)))

    plt.figure(figsize=(10, 6), dpi=80)
    plot_feature_importances(clf, cancer.feature_names)
    plt.tight_layout()
    plt.show()
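# Besides pre-pruning with max_depth and min_samples_leaf, scikit-learn also
# supports post-pruning via cost-complexity pruning (ccp_alpha, available
# since scikit-learn 0.22). A minimal sketch on the same breast cancer
# split; all names here are local to this sketch.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                    random_state=0)

# cost_complexity_pruning_path returns the effective alphas at which
# subtrees get pruned away; larger alpha yields a smaller tree.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(
    X_train, y_train)

for alpha in path.ccp_alphas[::10]:  # sample every 10th alpha for brevity
    pruned = DecisionTreeClassifier(random_state=0,
                                    ccp_alpha=alpha).fit(X_train, y_train)
    print('ccp_alpha={:.4f}: {} nodes, test accuracy {:.3f}'.format(
        alpha, pruned.tree_.node_count, pruned.score(X_test, y_test)))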