def decisionTreeExamplePlot():
    """Plot train/test accuracy of cost-complexity-pruned decision trees
    as a function of ``max_depth``.

    For each depth 1..19, the pruning path is computed and the tree is
    refit with an alpha taken at the 24/25 point of the path.

    NOTE(review): relies on module-level ``X_train``/``Y_train``/
    ``X_test``/``Y_test`` being defined elsewhere in the file.
    """
    depths = np.arange(1, 20)
    train_accuracy = np.empty(len(depths))
    test_accuracy = np.empty(len(depths))
    for i, k in enumerate(depths):
        dt = DecisionTreeClassifier(max_depth=k, random_state=12)
        path = dt.cost_complexity_pruning_path(X_train, Y_train)
        ccp_alphas = path.ccp_alphas
        # BUG FIX: the refit tree previously dropped max_depth=k, so the
        # x-axis ("Max Depth") did not actually constrain the plotted model.
        dt = DecisionTreeClassifier(
            max_depth=k,
            ccp_alpha=ccp_alphas[int(len(ccp_alphas) * 24 / 25)],
            random_state=12)
        dt.fit(X_train, Y_train)
        train_accuracy[i] = dt.score(X_train, Y_train)
        test_accuracy[i] = dt.score(X_test, Y_test)
    plt.plot(depths, test_accuracy, label='Decision Tree Testing dataset Accuracy')
    plt.plot(depths, train_accuracy, label='Decision Tree Training dataset Accuracy')
    plt.legend()
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.show()
def decision_tree():
    """Fit one pruned tree per alpha on the cost-complexity pruning path
    and plot train/test accuracy versus alpha.

    The final alpha is dropped because it prunes the tree to one node.
    Uses module-level X_train/X_test/y_train/y_test.
    """
    base = DecisionTreeClassifier(random_state=0)
    pruning_path = base.cost_complexity_pruning_path(X_train, y_train)
    # Drop the last alpha: it corresponds to the trivial single-node tree.
    alphas = pruning_path.ccp_alphas[:-1]
    fitted = [
        DecisionTreeClassifier(random_state=0, ccp_alpha=a).fit(X_train, y_train)
        for a in alphas
    ]
    train_acc = [model.score(X_train, y_train) for model in fitted]
    test_acc = [model.score(X_test, y_test) for model in fitted]
    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(alphas, train_acc, marker='o', label="train", drawstyle="steps-post")
    ax.plot(alphas, test_acc, marker='o', label="test", drawstyle="steps-post")
    ax.legend()
    plt.show()
def explore_tree_pruning(self):
    """Visualise train/test accuracy along the cost-complexity pruning
    path of an entropy decision tree fitted on this object's data.

    Uses a stratified train/test split of self.data on self.target.
    """
    labels = self.data[self.target]
    x_tr, x_te, y_tr, y_te = train_test_split(
        self.data[self.feature_names], labels, stratify=labels)
    pruner = DecisionTreeClassifier(random_state=0, criterion='entropy')
    alphas = pruner.cost_complexity_pruning_path(x_tr, y_tr).ccp_alphas
    fitted = [
        DecisionTreeClassifier(random_state=0, criterion='entropy',
                               ccp_alpha=a).fit(x_tr, y_tr)
        for a in alphas
    ]
    train_scores = [model.score(x_tr, y_tr) for model in fitted]
    test_scores = [model.score(x_te, y_te) for model in fitted]
    fig, ax = plt.subplots()
    ax.set_xlabel('alpha')
    ax.set_ylabel('accuracy')
    ax.set_title('accuracy vs. alpha')
    ax.plot(alphas, train_scores, marker='o', label='train', drawstyle='steps-post')
    ax.plot(alphas, test_scores, marker='o', label='test', drawstyle='steps-post')
    ax.legend()
    plt.show()
def decision_tree(X_train, X_test, y_train, y_test, criterion, max_features):
    """Fit a pruned tree for every alpha on the cost-complexity path and
    return the best test-set accuracy observed.

    Trees whose node count duplicates an earlier one are skipped, since
    they represent structurally identical models.

    Returns
    -------
    float : maximum test accuracy across the distinct pruned trees.
    """
    test_scores, training_scores = [], []
    trees, nodes = [], []
    clf = DecisionTreeClassifier(random_state=0)
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas
    for ccp_alpha in ccp_alphas:
        # BUG FIX: random_state was unset here although the path tree used
        # random_state=0, so results were not reproducible run to run.
        clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha,
                                     criterion=criterion,
                                     max_features=max_features,
                                     random_state=0)
        clf.fit(X_train, y_train)
        # Only evaluate each distinct tree size once.
        if clf.tree_.node_count not in nodes:
            training_scores.append(
                metrics.accuracy_score(y_train, clf.predict(X_train)))
            test_scores.append(
                metrics.accuracy_score(y_test, clf.predict(X_test)))
            trees.append(clf)
            nodes.append(clf.tree_.node_count)
    return max(test_scores)
def plot_decision_tree_alphas(X, y):
    """Plot decision-tree train/test accuracy along the cost-complexity
    pruning path of (X, y), using a 70/30 split with a fixed seed."""
    # split data set into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=667)
    # compute the pruning path; drop the last alpha (single-node tree)
    seed_tree = DecisionTreeClassifier(random_state=667)
    alphas = seed_tree.cost_complexity_pruning_path(X_train, y_train).ccp_alphas[:-1]
    train_acc, test_acc = [], []
    # fit one tree per alpha and record accuracy on both splits
    for alpha in alphas:
        pruned = DecisionTreeClassifier(random_state=667,
                                        ccp_alpha=alpha).fit(X_train, y_train)
        train_acc.append(accuracy_score(y_train, pruned.predict(X_train)))
        test_acc.append(accuracy_score(y_test, pruned.predict(X_test)))
    # chart both accuracy curves against alpha
    fig, ax = plt.subplots()
    ax.set_xlabel('alpha')
    ax.set_ylabel('accuracy')
    ax.set_title('Accuracy vs alpha for training and testing sets')
    ax.plot(alphas, train_acc, marker='o', label='train', drawstyle='steps-post')
    ax.plot(alphas, test_acc, marker='o', label='test', drawstyle='steps-post')
    ax.legend()
    plt.show()
class DecisionTreeModel:
    """Thin wrapper around sklearn's DecisionTreeClassifier that accepts
    either raw arrays or project TabularData containers, unwrapping the
    latter via DataConversion before delegating to the real estimator.

    Unknown attributes fall through to the wrapped model, so estimator
    attributes such as ``tree_`` or ``feature_importances_`` remain
    reachable on the wrapper.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded to sklearn unchanged.
        self.model = DecisionTreeClassifier(*args, **kwargs)

    @staticmethod
    def _x(X):
        # Unwrap a TabularData feature container to a raw array, else pass through.
        return DataConversion.extract_X(X) if isinstance(X, TabularData) else X

    @staticmethod
    def _y(y):
        # Unwrap a TabularData label container to a raw array, else pass through.
        return DataConversion.extract_y(y) if isinstance(y, TabularData) else y

    def get_model(self):
        """Return the wrapped DecisionTreeClassifier instance."""
        return self.model

    def apply(self, X, check_input=True):
        return self.model.apply(self._x(X), check_input)

    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
        return self.model.cost_complexity_pruning_path(
            self._x(X), self._y(y), sample_weight)

    def decision_path(self, X, check_input=True):
        return self.model.decision_path(self._x(X), check_input)

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        # BUG FIX: X_idx_sorted is kept in the signature for backward
        # compatibility but no longer forwarded -- sklearn deprecated and
        # removed the parameter, and passing it raises TypeError there.
        self.model.fit(self._x(X), self._y(y),
                       sample_weight=sample_weight, check_input=check_input)
        return self

    def predict(self, X, check_input=True):
        return self.model.predict(self._x(X), check_input)

    def predict_log_proba(self, X):
        return self.model.predict_log_proba(self._x(X))

    def predict_proba(self, X, check_input=True):
        return self.model.predict_proba(self._x(X), check_input)

    def score(self, X, y, sample_weight=None):
        return self.model.score(self._x(X), self._y(y), sample_weight)

    def __getattribute__(self, item):
        # BUG FIX: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit and masked real errors; delegate to
        # the wrapped model only when the wrapper lacks the attribute.
        try:
            return super().__getattribute__(item)
        except AttributeError:
            return getattr(self.model, item)
def decisionTree(data):
    """Tune a decision tree's pruning alpha on a 50/50 split, plot accuracy
    vs alpha, then evaluate the chosen alpha over several training-set
    sizes averaged across multiple random splits.

    Parameters
    ----------
    data : (features, labels) pair indexable as data[0], data[1].
    """
    # find the best alpha for pruning
    X_train, X_test, y_train, y_test = train_test_split(
        data[0], data[1], test_size=0.5, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    # drop the last two alphas (the near-trivial trees at the end of the path)
    ccp_alphas = path.ccp_alphas[:-2]
    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]
    # plot
    fig, ax = plt.subplots()
    ax.set_xlabel("Alpha")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs Alpha")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test")
    ax.legend()
    plt.show()
    # best alpha
    ccp_alpha = ccp_alphas[test_scores.index(max(test_scores))]
    print('Alpha: %2f' % ccp_alpha)
    # train with various training size
    train_size = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]
    train_scores, test_scores = [], []
    for ts in train_size:
        tmp1, tmp2 = [], []
        for state in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 100, 1000]:
            # split train and test set
            X_train, X_test, y_train, y_test = train_test_split(
                data[0], data[1], test_size=1 - ts, random_state=state)
            # train and evaluate model
            clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
            res_train, res_test = evaluation(clf, X_train, X_test, y_train, y_test)
            tmp1.append(res_train)
            tmp2.append(res_test)
        train_scores.append(sum(tmp1) / len(tmp1))
        test_scores.append(sum(tmp2) / len(tmp2))
    # plot
    fig, ax = plt.subplots()
    ax.set_xlabel("Training size %")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs Training size")
    ax.plot(train_size, train_scores, marker='o', label="train")
    ax.plot(train_size, test_scores, marker='o', label="test")
    ax.legend()
    plt.show()
    # BUG FIX: was print(print(...)) which printed the message and then "None".
    print('Overall accuracy: %2f' % max(test_scores))
def calculate():
    """Explore the iris data set, fit a pruned decision tree and an SVM,
    and chart the tree's accuracy along its cost-complexity pruning path.

    Reads ``iris.csv`` from the working directory and shows each figure
    interactively.
    """
    df = pd.read_csv('iris.csv')
    # quick-look visualisations of the raw data
    df['petal.width'].plot.hist()
    plt.show()
    sns.pairplot(df, hue='species')
    plt.show()
    feature_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    all_inputs = df[feature_cols].values
    all_classes = df['species'].values
    (train_inputs, test_inputs,
     train_classes, test_classes) = train_test_split(
        all_inputs, all_classes, train_size=0.7, random_state=1)
    dtc = DecisionTreeClassifier(random_state=0, ccp_alpha=0.01)
    dtc.fit(train_inputs, train_classes)
    print("Score is " + str(dtc.score(test_inputs, test_classes)))
    clf = SVC(random_state=0)
    clf.fit(train_inputs, train_classes)
    plot_confusion_matrix(clf, test_inputs, test_classes)
    plt.show()
    # from the graphic we can see:
    # setosa - no mistakes
    # versicolor - no mistakes
    # virginica was confused with versicolor 1 time
    # pruning
    path = dtc.cost_complexity_pruning_path(train_inputs, train_classes)
    print(path)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    plt.figure(figsize=(10, 6))
    plt.plot(ccp_alphas, impurities)
    plt.xlabel("effective alpha")
    plt.ylabel("")
    plt.show()
    pruned_trees = []
    for alpha in ccp_alphas:
        candidate = DecisionTreeClassifier(random_state=0, ccp_alpha=alpha)
        candidate.fit(train_inputs, train_classes)
        pruned_trees.append(candidate)
    acc_scores = [accuracy_score(test_classes, t.predict(test_inputs))
                  for t in pruned_trees]
    # depths kept around for interactive inspection
    tree_depths = [t.tree_.max_depth for t in pruned_trees]
    plt.figure(figsize=(10, 6))
    plt.grid()
    plt.plot(ccp_alphas[:-1], acc_scores[:-1])
    plt.xlabel("effective alpha")
    plt.ylabel("Accuracy scores")
    plt.show()
def decision_tree_post_pruning(X, y):
    """Walk the cost-complexity pruning path of a decision tree on (X, y):
    fit one tree per alpha, chart node count and depth versus alpha, then
    chart train/test accuracy versus alpha."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    probe = DecisionTreeClassifier(random_state=0)
    pruning = probe.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = pruning.ccp_alphas
    models = []
    for alpha in ccp_alphas:
        model = DecisionTreeClassifier(random_state=0, ccp_alpha=alpha)
        model.fit(X_train, y_train)
        models.append(model)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        models[-1].tree_.node_count, ccp_alphas[-1]))
    # drop the trivial single-node tree produced by the largest alpha
    models = models[:-1]
    ccp_alphas = ccp_alphas[:-1]
    node_counts = [m.tree_.node_count for m in models]
    depth = [m.tree_.max_depth for m in models]
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
    ax[0].set_xlabel("alpha")
    ax[0].set_ylabel("number of nodes")
    ax[0].set_title("Number of nodes vs alpha")
    ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
    ax[1].set_xlabel("alpha")
    ax[1].set_ylabel("depth of tree")
    ax[1].set_title("Depth vs alpha")
    fig.tight_layout()
    train_scores = [m.score(X_train, y_train) for m in models]
    test_scores = [m.score(X_test, y_test) for m in models]
    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()
def tree(x_train, x_test, y_train, y_test, features, criterion, test_split,
         dataset):
    """Fit a pruned tree for every alpha on the cost-complexity path and
    return the best test accuracy.

    Trees whose node count duplicates an earlier tree are skipped as
    structurally identical.  ``test_split`` and ``dataset`` are kept in
    the signature for caller compatibility (they previously selected
    which accuracy plot to save; that plotting code was dead/commented
    and has been removed).

    Returns
    -------
    float : maximum test accuracy across distinct pruned trees.
    """
    test_accuracies = []
    training_accuracies = []
    trees = []
    nodes = []
    clf = DecisionTreeClassifier(random_state=0)
    path = clf.cost_complexity_pruning_path(x_train, y_train)
    alphas = path.ccp_alphas
    for alpha in alphas:
        # BUG FIX: random_state was unset here while the path tree pinned
        # random_state=0, making the returned accuracy non-reproducible.
        clf = DecisionTreeClassifier(ccp_alpha=alpha,
                                     criterion=criterion,
                                     max_features=features,
                                     random_state=0)
        clf.fit(x_train, y_train)
        if clf.tree_.node_count not in nodes:
            training_accuracies.append(
                metrics.accuracy_score(y_train, clf.predict(x_train)))
            test_accuracies.append(
                metrics.accuracy_score(y_test, clf.predict(x_test)))
            trees.append(clf)
            nodes.append(clf.tree_.node_count)
    return max(test_accuracies)
def correlation_with_dependent(X, y, file):
    """Fit a one-level decision stump of X against y, report its confusion
    matrix and precision metrics, and render the tree to a PNG under
    ``correlations/``.

    The stump acts as a crude "correlation with the dependent" probe: its
    single split shows which variable and cut point best separate the
    classes.
    """
    # TODO find variables that contain all of the mortstat=1 values but reduce
    # the sample size.  One way: run a one-level decision tree for each column
    # in X, and choose the variable and cut point with the lowest class==0
    # impurity -- i.e. all the class==1 nodes are in one leaf.
    out_dir = "correlations/"  # renamed from `dir` (shadowed the builtin)
    tree_clf = DecisionTreeClassifier(
        criterion='gini',
        max_depth=1,
        min_samples_split=2,
        max_features=len(X.columns),
        # weight class 1 heavily so the stump chases the rare class
        class_weight={
            0: 0.1,
            1: 0.9
        },
    )
    tree_clf.fit(X, y)
    pred = tree_clf.predict(X)
    # Confusion matrix and classification report
    print("Correlation with dependent tree")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred))
    y_score = tree_clf.score(X, y)
    print('Accuracy: ', y_score)
    # BUG FIX: precision_score takes (y_true, y_pred); the arguments were
    # passed in reverse order, so the macro/per-class numbers reported were
    # actually recall-like values, not precision.
    micro_precision = precision_score(y, pred, average='micro')
    print('Micro-averaged precision score: {0:0.2f}'.format(micro_precision))
    macro_precision = precision_score(y, pred, average='macro')
    print('Macro-averaged precision score: {0:0.2f}'.format(macro_precision))
    per_class_precision = precision_score(y, pred, average=None)
    print('Per-class precision score:', per_class_precision)
    path = tree_clf.cost_complexity_pruning_path(X, y)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    print(f"Impurities: {path.impurities}")
    create_png(out_dir + file, X, tree_clf)
def prune_tree_model(train_features, train_labels, test_features, test_labels):
    """Search the cost-complexity pruning path for the alpha whose pruned
    tree scores best on the test set, and return that alpha.

    Returns
    -------
    float : the ``ccp_alpha`` of the best-scoring pruned tree.
    """
    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_depth=10,
                                 min_samples_split=5,
                                 min_samples_leaf=10)
    path = clf.cost_complexity_pruning_path(train_features, train_labels)
    ccp_alphas = path.ccp_alphas
    clfs = []
    best_clf = None
    best_score = 0.0
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(criterion='entropy',
                                     max_depth=10,
                                     min_samples_split=5,
                                     min_samples_leaf=10,
                                     random_state=0,
                                     ccp_alpha=ccp_alpha)
        clf.fit(train_features, train_labels)
        clfs.append(clf)
        acc_score = clf.score(test_features, test_labels)
        print("Score: " + str(acc_score))
        # BUG FIX: best_score was never updated, so every tree with a
        # positive score overwrote best_clf and the *last* such tree won
        # instead of the best one.
        if acc_score > best_score:
            best_score = acc_score
            best_clf = clf
    # BUG FIX: guard against an empty result (all scores == 0.0), which
    # previously raised IndexError on best_clf[-1].
    if best_clf is None:
        best_clf = clfs[-1]
    return best_clf.ccp_alpha
def DecisionTreeClassifier(self):
    """Find the best model (hyperparameters optimization) in the family
    of decision trees.

    Walks the cost-complexity pruning path from most- to least-pruned,
    keeping the model with the lowest nescience score.  With ``self.fast``
    the search stops at the first non-improving candidate.

    NOTE(review): this method shadows sklearn's class name at the class
    level; inside the body the *global* DecisionTreeClassifier is used.

    Return
    ------
    best_nsc : best nescience achieved
    best_model : a trained DecisionTreeClassifier
    best_viu : None, since all the variables are used as input
    """
    # At least 5 samples per leaf keeps the search tractable and is
    # considered good ML practice anyway.
    probe = DecisionTreeClassifier(min_samples_leaf=5)
    # Compute pruning points
    path = probe.cost_complexity_pruning_path(self.X_, self.y_)
    previous_nodes = -1
    best_nsc = 1
    best_model = None
    # Walk every possible pruning point in reverse order
    for ccp_alpha in reversed(path.ccp_alphas):
        candidate = DecisionTreeClassifier(ccp_alpha=ccp_alpha,
                                           min_samples_leaf=5,
                                           random_state=self.random_state)
        candidate.fit(self.X_, self.y_)
        # Skip evaluation if the tree structure has not changed
        if candidate.tree_.node_count == previous_nodes:
            continue
        previous_nodes = candidate.tree_.node_count
        new_nsc = self.nescience_.nescience(candidate)
        if new_nsc < best_nsc:
            best_nsc, best_model = new_nsc, candidate
        elif self.fast:
            # Early stop at the first non-improving candidate
            break
    return (best_nsc, best_model, None)
def post_pruning(X, y):
    """Minimal-complexity post-pruning for large decision trees.

    Computes the ccp_alpha candidates from the cost-complexity pruning
    path on (X, y), cross-validates a tree at each alpha (5 folds, MSE),
    and applies the one-standard-error rule: the largest alpha whose mean
    CV error is within one standard deviation of the minimum.

    Parameters
    ----------
    X: Input features
    y: Labels

    Returns
    -------
    ccp_alpha: Selected best alpha a* (0.0 when the path has no candidates)
    """
    # https://medium.com/swlh/post-pruning-decision-trees-using-python-b5d4bcda8e23
    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
    base = DecisionTreeClassifier(random_state=42)
    candidate_alphas = base.cost_complexity_pruning_path(X, y).ccp_alphas[:-1]
    if len(candidate_alphas) == 0:
        return 0.0
    fold_scores = [
        cross_val_score(DecisionTreeClassifier(ccp_alpha=alpha), X, y, cv=5,
                        scoring="neg_mean_squared_error", n_jobs=-1)
        for alpha in candidate_alphas
    ]
    # average over folds, fix sign of mse
    fold_mse = -np.mean(fold_scores, 1)
    # One-standard-error rule: pick the most parsimonious model (largest
    # alpha) whose mean error is within one std of the minimum mse.
    return np.max(
        candidate_alphas[fold_mse <= np.min(fold_mse) + np.std(fold_mse)])
def get_cpp_alphas(self, data):
    """Chart the impurity-vs-alpha pruning path for ``data``, store a
    100-point alpha grid on ``self.ccp_alphas``, and chart how tree
    complexity (nodes, depth, leaves) varies over that grid.

    Both figures are written under ``output/`` and the current figure is
    cleared afterwards.
    """
    probe = DecisionTreeClassifier(random_state=0)
    pruning = probe.cost_complexity_pruning_path(data.X_train, data.y_train)
    path_df = pd.DataFrame(pruning)
    path_df[:-1].plot(
        x="ccp_alphas",
        y="impurities",
        style=".-",
        legend=False,
        title=f"Impurities vs ccp_alphas on {data.data_name} data")
    plt.ylabel("total impurity of leaves")
    plt.savefig(os.path.join("output", f"{data.data_name}_ccp_alphas.png"))
    plt.clf()
    # Grid from 0 up to the second-to-last alpha whose impurity stays < 0.5.
    grid_upper = path_df[path_df["impurities"] < 0.5]["ccp_alphas"].iloc[-2]
    self.ccp_alphas = np.linspace(0, grid_upper, 100)
    complexity = {
        "cpp_alphas": [],
        "node_count": [],
        "max_depth": [],
        "n_leaves": [],
    }
    for alpha in self.ccp_alphas:
        fitted = DecisionTreeClassifier(random_state=0, ccp_alpha=alpha)
        fitted.fit(data.X_train, data.y_train)
        complexity["cpp_alphas"].append(alpha)
        complexity["node_count"].append(fitted.tree_.node_count)
        complexity["max_depth"].append(fitted.tree_.max_depth)
        complexity["n_leaves"].append(fitted.tree_.n_leaves)
    res_df = pd.DataFrame(complexity).set_index("cpp_alphas")
    res_df.plot(
        subplots=True,
        style=".-",
        figsize=(10, 5),
        title=
        f"Decision tree complexity vs ccp_alpha on {data.data_name} data",
    )
    plt.savefig(
        os.path.join("output", f"{data.data_name}_ccp_alphas_tree.png"))
    plt.clf()
def analyze_ccp_alpha(X_train, X_test, y_train, y_test):
    """Chart decision-tree train/test accuracy across the ccp_alpha
    pruning path.

    Exists only for offline analysis of the ccp_alpha parameter; it is too
    slow for the normal pipeline and is kept for completeness.
    Code derived from:
    https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

    Args:
        X_train (array): The training documents
        X_test (array): the testing documents
        y_train (array): the training labels
        y_test (array): the testing labels
    """
    probe = DecisionTreeClassifier(random_state=0)
    pruning = probe.cost_complexity_pruning_path(X_train, y_train)
    alphas = pruning.ccp_alphas
    fitted = [
        DecisionTreeClassifier(random_state=0, ccp_alpha=a).fit(X_train, y_train)
        for a in alphas
    ]
    train_scores = [model.score(X_train, y_train) for model in fitted]
    test_scores = [model.score(X_test, y_test) for model in fitted]
    fig, ax = plt.subplots()
    ax.set_xlabel("Alpha")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(alphas, train_scores, marker='o', label="train",
            drawstyle="steps-post")
    ax.plot(alphas, test_scores, marker='o', label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()
def opt_ad(x_train, y_train, x_valid, y_valid):
    """Scan the cost-complexity pruning path of a decision tree (AD) and
    report the pruning alpha with the lowest validation RMSE.

    NOTE(review): despite the original (Portuguese) docstring saying this
    obtains the best *alpha*, the function returns the minimum RMSE; the
    chosen alpha is only printed.  The return value is kept unchanged for
    caller compatibility.

    Returns
    -------
    The minimum validation RMSE over the pruning path.
    """
    ad = AD()
    path = ad.cost_complexity_pruning_path(x_train, y_train)
    alphas = path.ccp_alphas
    res = np.zeros((len(alphas), 1))
    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
    for i, alpha in enumerate(alphas):
        ad = AD(random_state=0, ccp_alpha=alpha)
        ad.fit(x_train, y_train)
        res[i] = math.sqrt(
            sk.metrics.mean_squared_error(ad.predict(x_valid), y_valid))
    i = res.argmin()
    # BUG FIX: the message said 'SVM' but this function evaluates a
    # decision tree (AD).
    print(f'AD = {res[i]}')
    print(f':: alpha = {alphas[i]}')
    return res[i]
# %% # Total impurity of leaves vs effective alphas of pruned tree # --------------------------------------------------------------- # Minimal cost complexity pruning recursively finds the node with the "weakest # link". The weakest link is characterized by an effective alpha, where the # nodes with the smallest effective alpha are pruned first. To get an idea of # what values of ``ccp_alpha`` could be appropriate, scikit-learn provides # :func:`DecisionTreeClassifier.cost_complexity_pruning_path` that returns the # effective alphas and the corresponding total leaf impurities at each step of # the pruning process. As alpha increases, more of the tree is pruned, which # increases the total impurity of its leaves. X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = DecisionTreeClassifier(random_state=0) path = clf.cost_complexity_pruning_path(X_train, y_train) ccp_alphas, impurities = path.ccp_alphas, path.impurities # %% # In the following plot, the maximum effective alpha value is removed, because # it is the trivial tree with only one node. fig, ax = plt.subplots() ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post") ax.set_xlabel("effective alpha") ax.set_ylabel("total impurity of leaves") ax.set_title("Total Impurity vs effective alpha for training set") # %% # Next, we train a decision tree using the effective alphas. The last value # in ``ccp_alphas`` is the alpha value that prunes the whole tree, # leaving the tree, ``clfs[-1]``, with one node.
def decision_tree_experiment(dataset, hparams, output_fn_base):
    """Run the full decision-tree experiment for one dataset.

    Trains a tree with the supplied hyperparameters, records a learning
    curve, confusion matrix, graphviz tree rendering, 10-fold
    cross-validated metrics, and sensitivity sweeps over ccp_alpha,
    max_depth and min_samples_leaf (each sweep saved under
    ``<output_fn_base>/DT/``).

    Parameters
    ----------
    dataset : mapping with "features" (DataFrame) and "class" (labels)
    hparams : mapping with "max_depth", "splitter", "criterion",
        "ccp_alpha", "known_max_depth", "known_max_min_leaf"
    output_fn_base : directory / file-name stem for all artifacts

    Returns
    -------
    (logs, metrics_dictionary) : report strings and a dict holding the
        learning-curve arrays plus the cross_validate results.
    """
    logs = []
    metrics_dictionary = {}
    print("----Running Decision Tree Experiment-----")
    print("Hyperparameters Used: ")
    print(hparams)
    X = dataset["features"]
    y = dataset["class"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, test_size=.2, train_size=.8)
    # (a commented-out GridSearchCV tuning scaffold lived here; the tuned
    # values now arrive via hparams)
    tree_clf = DecisionTreeClassifier(max_depth=hparams["max_depth"],
                                      splitter=hparams["splitter"],
                                      criterion=hparams["criterion"],
                                      ccp_alpha=hparams["ccp_alpha"])
    # learning curve
    cv = ShuffleSplit(n_splits=100, test_size=0.2)
    train_scores, train_sizes, validation_scores, fit_times = plot_learning_curve2(
        tree_clf,
        "%s/DT/Curve_%s" % (output_fn_base, output_fn_base),
        "Unboosted DT Learning Curve for %s" % output_fn_base,
        X, y, ylim=None, cv=cv, n_jobs=4)
    metrics_dictionary["train_scores"] = train_scores
    metrics_dictionary["train_sizes"] = train_sizes
    metrics_dictionary["validation_scores"] = validation_scores
    metrics_dictionary["fit_times"] = fit_times
    tree_clf.fit(X=X_train, y=y_train)
    runtimes = get_runtime_avgs(tree_clf, X_test)
    # confusion matrix
    matrix = create_confusion_matrix(
        tree_clf, X, y, 10,
        "%s/DT/Confusion_%s" % (output_fn_base, output_fn_base))
    # let's show the actual tree
    export_graphviz(tree_clf,
                    out_file=("%s/DT/%s.dot" % (output_fn_base, output_fn_base)),
                    feature_names=dataset["features"].columns,
                    rounded=True,
                    filled=True)
    (graph, ) = pydot.graph_from_dot_file(
        '%s/DT/%s.dot' % (output_fn_base, output_fn_base))
    graph.write_png('%s/DT/%s.png' % (output_fn_base, output_fn_base))
    # classification report:
    logs.append(classification_report(y_test, tree_clf.predict(X_test)))
    # cross validate:
    # BUG FIX: the accuracy scorer key was 'accuracy:' (trailing colon),
    # which produced cvd['test_accuracy:'] and left the accuracy log line
    # below reading precision instead. Key fixed; log line fixed too.
    cvd = cross_validate(
        tree_clf,
        X.values,
        y.values,
        cv=10,
        scoring={
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score, average='micro'),
            'recall': make_scorer(recall_score, average='micro'),
            'f1_score': make_scorer(f1_score, average='micro')
        },
    )
    metrics_dictionary["cvd"] = cvd
    # now run experiments for the same metric but relative to hyperparameters
    # pruning is done with min_samples_leaf, max_depth, ccp_alpha
    # info on ccp_alpha: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
    ##################################
    # CCP_ALPHA
    path = tree_clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    ccp_alphas = sorted(ccp_alphas)
    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]
    fig, ax = plt.subplots()
    ax.plot(ccp_alphas[:-1], impurities[:-1])
    ax.set_xlabel("effective alpha")
    ax.set_ylabel("total impurity of leaves")
    ax.set_title("Total Impurity vs effective alpha for training set")
    plt.savefig("%s/DT/IMPURITYVALPHA_%s" % (output_fn_base, output_fn_base))
    plt.close()
    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Alpha Impact on Score - DT For %s" % output_fn_base)
    ax.plot(ccp_alphas, train_scores, label="In-Sample")
    ax.plot(ccp_alphas, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/CCPALPHA_%s" % (output_fn_base, output_fn_base))
    plt.close()
    ##################################
    # Max_depth
    max_depths = list(range(hparams["known_max_depth"] + 2)[1:])
    clfs = []
    for depth in max_depths:
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with max_depth: {}".format(
        clfs[-1].tree_.node_count, max_depths[-1]))
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]
    fig, ax = plt.subplots()
    ax.set_xlabel("max_depth")
    ax.set_ylabel("accuracy")
    ax.set_title("Max Depth Impact on Score - DT For %s" % output_fn_base)
    ax.plot(max_depths, train_scores, label="In-Sample")
    ax.plot(max_depths, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/DEPTH_%s" % (output_fn_base, output_fn_base))
    plt.close()
    ##################################
    # min_samples_leaf
    min_samples_leaf_params = list(
        range(1, hparams["known_max_min_leaf"], 1)[1:])
    clfs = []
    for leaf_size in min_samples_leaf_params:
        clf = DecisionTreeClassifier(min_samples_leaf=leaf_size)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with min_samples_leaf: {}".
          format(clfs[-1].tree_.node_count, min_samples_leaf_params[-1]))
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]
    fig, ax = plt.subplots()
    ax.set_xlabel("min_samples_leaf")
    ax.set_ylabel("accuracy")
    ax.set_title("Min Leaf Samples Impact on Score - DT For %s" %
                 output_fn_base)
    ax.plot(min_samples_leaf_params, train_scores, label="In-Sample")
    ax.plot(min_samples_leaf_params, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/LEAF_%s" % (output_fn_base, output_fn_base))
    plt.close()
    logs.append("\tHyperparameters: \n")
    logs.append("\t%s" % str(hparams))
    logs.append("\n\n\tConfusion Matrix: \n")
    logs.append("\t%s\n" % str(matrix).replace("\n", "\n\t"))
    # BUG FIX: mean accuracy previously reported cvd["test_precision"].
    logs.append("\n\tMean Accuracy %.05f" % cvd["test_accuracy"].mean())
    logs.append("\n\tMean Precision Score of positive examples %.05f" %
                cvd["test_precision"].mean())
    logs.append("\n\tMean Recall Score of positive examples %.05f" %
                cvd["test_recall"].mean())
    logs.append("\n\tF1 Score of positive examples %.05f\n" %
                cvd["test_f1_score"].mean())
    logs.append("\n\tMean Query Time %.05f\n" % runtimes)
    return logs, metrics_dictionary
def get_best_tree(data, criterion, name="", graph=False):
    """Fit a pruned tree at every alpha on the cost-complexity path and
    return the one with the best test accuracy, plus its train score,
    test score and node count.

    With ``graph=True`` two plots (complexity vs alpha, accuracy vs alpha)
    are saved under ``images/``, prefixed with ``name`` when given.
    """
    x_train, y_train = data["training_data"], data["training_target"]
    x_test, y_test = data["test_data"], data["test_target"]
    probe = DecisionTreeClassifier(random_state=0, criterion=criterion)
    pruning = probe.cost_complexity_pruning_path(x_train, y_train)
    ccp_alphas = pruning.ccp_alphas
    candidates = [
        DecisionTreeClassifier(random_state=0, criterion=criterion,
                               ccp_alpha=alpha).fit(x_train, y_train)
        for alpha in ccp_alphas
    ]
    # Remove tree with only one node
    candidates = candidates[:-1]
    ccp_alphas = ccp_alphas[:-1]
    train_scores = [c.score(x_train, y_train) for c in candidates]
    test_scores = [c.score(x_test, y_test) for c in candidates]
    node_counts = [c.tree_.node_count for c in candidates]
    depth = [c.tree_.max_depth for c in candidates]
    if graph:
        prefix = "images/" if name == "" else "images/" + name + "_"
        # Graph num nodes and depth vs alpha
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
        ax[0].set_xlabel("alpha")
        ax[0].set_ylabel("number of nodes")
        ax[0].set_title("Number of nodes vs alpha")
        ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
        ax[1].set_xlabel("alpha")
        ax[1].set_ylabel("depth of tree")
        ax[1].set_title("Depth vs alpha")
        fig.tight_layout()
        plt.savefig(prefix + "plot0_" + criterion + ".png")
        # Graph accuracy vs alpha
        fig, ax = plt.subplots()
        ax.set_xlabel("alpha")
        ax.set_ylabel("accuracy")
        ax.set_title("Accuracy vs alpha for training and testing sets")
        ax.plot(ccp_alphas, train_scores, marker='o', label="train",
                drawstyle="steps-post")
        ax.plot(ccp_alphas, test_scores, marker='o', label="test",
                drawstyle="steps-post")
        ax.legend()
        plt.savefig(prefix + "plot1_" + criterion + ".png")
    best = test_scores.index(max(test_scores))
    return candidates[best], train_scores[best], test_scores[best], node_counts[best]
# in this case, true positive = 74% and true negative is 79%. looking at the # tree itself, it seems that the tree is very complex and might have overfit # the data. in general, decision trees are prone to overfitting because of the # large number of parameters in the tree model # we can prune the tree to minimize overfitting and optimize the model # 1. Cost Complexity Pruning # 2. Cross Validation ############### Model Optimization: Cost Complexity Pruning ################## # we need to find the right value of the puring parameter, alpha. we can plot # alpha as a function of the accuracy of the tree for both the training and # testing data to find the optimum alpha. path = clf_dt.cost_complexity_pruning_path(X_train, y_train) # etermine value for alpha ccp_alphas = path.ccp_alphas # extract different values for alphas ccp_alphas = ccp_alphas[: -1] # exclude the maximum value for alpha. it corresponds to the root node clf_dts = [] # create an empty array to put decission trees # create a decision tree for each value of alpha and store it in clf_dts for ccp_alpha in ccp_alphas: clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) clf_dt.fit(X_train, y_train) clf_dts.append(clf_dt) # plotting the accuracy as a function of alpha for the training set and the # testing set train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts]
def main():
    """Fetch flow features from the internet_traffic MySQL database, dump
    them to mining.csv, train a depth-limited decision tree to separate
    BENIGN from DDoS traffic, then tune its cost-complexity pruning alpha
    with 10-fold cross-validation.

    Side effects: writes mining.csv, preliminary.png and best.png, and shows
    several matplotlib figures (confusion matrices, accuracy-vs-alpha plots).
    """
    # Database credentials are read interactively.
    username = input("Enter your username\n")
    password = input("Enter your password\n")
    cnx = mysql.connector.connect(username=username, password=password,
                                  host='localhost',
                                  database='internet_traffic')
    cursor = cnx.cursor(dictionary=True)
    print("Fetching data from database...")

    # query the data we want from the database (joins all flow sub-tables)
    queryString = "select iat_mean, fwd_packets, bwd_packets, duration, label, bytes_per_second, syn_flag_count, rst_flag_count, psh_flag_count, ack_flag_count, urg_flag_count, cwe_flag_count, ece_flag_count, active_time_mean, idle_time_mean from (((((flow inner join flowbytes on flow.id = flowbytes.flow_id) inner join flowflags on flow.id = flowflags.flow_id) inner join flowiat on flow.id = flowiat.flow_id) inner join flowinfo on flow.id = flowinfo.flow_id) inner join flowpackets on flow.id = flowpackets.flow_id) inner join protocol on flow.protocol_id = protocol.id"
    cursor.execute(queryString)
    rows = []
    for i in cursor:
        rows.append(i)

    # Dump the result set to CSV so it can be re-loaded with pandas below.
    with open('mining.csv', 'w', newline='') as f:
        fieldnames = []
        for i in cursor.column_names:
            fieldnames.append(i)
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print("Data successfully fetched and recorded in csv file...")

    # import the data
    dataframe = pd.read_csv("mining.csv", header=0)
    # Rename columns to friendlier names (order must match the SELECT list).
    dataframe.columns = [
        'IATMean', 'ForwardPackets', 'BackwardPackets', 'Duration', 'Label',
        'BytesPerSecond', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
        'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
        'ActiveTimeMean', 'IdleTimeMean'
    ]
    # used to check if the dataframe loaded the data properly
    print(dataframe.head())
    print(dataframe.dtypes)

    # Print unique values for each column and replace missing values with -1,
    # one column per iteration.
    for columnName in dataframe.columns:
        print(columnName + ":")
        print(dataframe[columnName].unique())
        dataframe = dataframe.fillna({columnName: -1})

    # split dataframe into independent (X) and dependent (y) variables
    X = dataframe.drop('Label', axis=1).copy()
    y = dataframe['Label'].copy()

    # build the preliminary classification tree
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = DecisionTreeClassifier(random_state=42, max_depth=5)
    clf = clf.fit(X_train, y_train)

    # plot the preliminary tree (rendered to PNG via graphviz/pydotplus)
    dot_data = StringIO()
    export_graphviz(clf, filled=True, rounded=True, special_characters=True,
                    feature_names=X.columns, class_names=['BENIGN', 'DDoS'],
                    out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('preliminary.png')
    Image(graph.create_png())

    # create the confusion matrix for the preliminary decision tree
    disp = plot_confusion_matrix(clf, X_test, y_test,
                                 display_labels=["BENIGN", "DDoS"])
    plt.show()

    # cost complexity pruning:
    # goal is to find the best pruning parameter alpha, which controls how
    # much pruning happens.
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas
    # Drop the maximum alpha: it prunes the tree down to a single node.
    ccp_alphas = ccp_alphas[:-1]
    clfs = []  # we put decision trees into here
    print("Cost Complexity Pruning")
    for ccp_alpha in ccp_alphas:
        print("make tree for alpha")
        clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha,
                                     max_depth=5)
        clf = clf.fit(X_train, y_train)
        clfs.append(clf)

    # Accuracy of every pruned tree on the train and test splits.
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]
    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()

    # There could have been many ways to divide the training and testing
    # dataset. We use 10-fold cross validation to see if we used the best
    # training/testing split (i.e. one split may have a different optimal
    # alpha). First, demonstrate that a single alpha is sensitive to the
    # particular dataset split.
    print("Cross validation")
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=0.000005,
                                 max_depth=5)
    scores = cross_val_score(clf, X_train, y_train, cv=10)
    df = pd.DataFrame(data={'tree': range(10), 'accuracy': scores})
    df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
    plt.show()

    # use cross validation to find the optimal value for ccp_alpha:
    # for each alpha candidate, run a 10-fold cross validation.
    alpha_loop_values = []
    print("10-fold for more than one alpha")
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha,
                                     max_depth=5)
        scores = cross_val_score(clf, X_train, y_train, cv=10)
        alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
        print("Finished one alpha candidate")

    # graph the mean and standard deviation of the scores for each candidate alpha
    alpha_results = pd.DataFrame(alpha_loop_values,
                                 columns=['alpha', 'mean_accuracy', 'std'])
    alpha_results.plot(x='alpha', y='mean_accuracy', yerr='std', marker='o',
                       linestyle='--')
    plt.show()

    # Narrow down to the exact optimal alpha used to build the final pruned
    # tree. NOTE(review): the (0, 0.0001) window is hard-coded for this
    # dataset — confirm it brackets the peak of the mean-accuracy curve.
    print("optimal alpha value")
    optimal_alpha = alpha_results[(alpha_results['alpha'] > 0)
                                  & (alpha_results['alpha'] < 0.0001)]
    print(optimal_alpha)

    # optimal pruned tree (alpha value chosen from the search above)
    clf = DecisionTreeClassifier(random_state=42,
                                 ccp_alpha=2.247936 * (10**(-10)),
                                 max_depth=5)
    clf = clf.fit(X_train, y_train)
    dot_data = StringIO()
    export_graphviz(clf, filled=True, rounded=True, special_characters=True,
                    feature_names=X.columns, class_names=['BENIGN', 'DDoS'],
                    out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png('best.png')
    Image(graph.create_png())

    # draw a confusion matrix for the optimal pruned tree
    disp = plot_confusion_matrix(clf, X_test, y_test,
                                 display_labels=["BENIGN", "DDoS"])
    print(disp)
    plt.show()
class DTLearner(object):
    """Decision-tree learner supporting one-hyperparameter-at-a-time sweeps
    plus a final GridSearchCV tune.

    The ``flag`` argument of train()/test() selects the swept hyperparameter:
        0 -> ccp_alpha (cost-complexity pruning)
        1 -> max_depth
        2 -> min_samples_leaf
        3 -> min_samples_split

    Progress and results are appended to the "dt_info.txt" log file, which is
    closed by final_test().
    """

    # Per-flag (report/plot label, plot filename). The label feeds the printed
    # messages, axis label and title, matching the original wording exactly.
    _SWEEP_INFO = {
        0: ("Alpha", "alpha_vs_accuracy.png"),
        1: ("Depth", "depth_vs_accuracy.png"),
        2: ("min_sample_leaf", "minsampleleaf_vs_accuracy.png"),
        3: ("min_sample_split", "minsamplesplit_vs_accuracy.png"),
    }

    def __init__(self, leaf_size=1, n_folds=10, verbose=False):
        self.leaf_size = leaf_size
        self.n_folds = n_folds          # folds used by GridSearchCV
        self.cv_scores = []
        self.clf = DecisionTreeClassifier()
        self.predictions = []
        self.accuracy_score = 0.0
        self.verbose = verbose
        # Search grid for tune_hyperparameters().
        self.param_dict = {
            "criterion": ['gini', 'entropy'],
            "ccp_alpha": [0, 0.0002, 0.0004, 0.0006, 0.0008, 0.001],
            "max_depth": range(1, 25),
            "min_samples_split": range(2, 5),
            "min_samples_leaf": range(1, 5)
        }
        self.grid = 0
        # Write data to file for easy analysis; closed in final_test().
        self.f = open("dt_info.txt", "a")
        self.f.write("\n")
        self.f.write(str(datetime.now()))

    def _fit_candidates(self, X_train, y_train, param_name, values):
        # One fitted tree per candidate value of `param_name`; random_state
        # fixed so sweeps are reproducible.
        clfs = []
        for value in values:
            clf = DecisionTreeClassifier(random_state=0, **{param_name: value})
            clf.fit(X_train, y_train)
            clfs.append(clf)
        return clfs

    def train(self, X_train, y_train, flag):
        '''
        Fit one DecisionTreeClassifier per candidate hyperparameter value.

        :param X_train: training data
        :param y_train: training labels
        :param flag: which hyperparameter to sweep (see class docstring)
        :return: (list of fitted classifiers, candidate values)
        '''
        if self.verbose:
            print("Training Decision Tree Model...")
            self.f.write("Training Decision Tree Model...")
        if flag == 0:
            # Candidate ccp_alpha values come from the cost-complexity pruning
            # path (effective alphas); every 5th value is kept to bound the
            # number of fits.
            # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
            path = self.clf.cost_complexity_pruning_path(X_train, y_train)
            ccp_alphas = path.ccp_alphas[0::5]
            clfs = self._fit_candidates(X_train, y_train, "ccp_alpha", ccp_alphas)
            # Drop the last alpha/tree: the maximum alpha prunes the tree to
            # a single node.
            return clfs[:-1], ccp_alphas[:-1]
        if flag == 1:
            possible_depths = range(1, 25)
            return (self._fit_candidates(X_train, y_train, "max_depth",
                                         possible_depths),
                    possible_depths)
        if flag == 2:
            possible_min_samples_leaf = range(1, 20)
            return (self._fit_candidates(X_train, y_train, "min_samples_leaf",
                                         possible_min_samples_leaf),
                    possible_min_samples_leaf)
        if flag == 3:
            possible_min_samples_split = range(2, 20)
            return (self._fit_candidates(X_train, y_train, "min_samples_split",
                                         possible_min_samples_split),
                    possible_min_samples_split)

    def test(self, X_test, X_train, y_test, y_train, clfs, alphas, depths,
             min_samples_leafs, min_samples_splits, flag):
        '''
        Score every candidate classifier on both validation sets, log and plot
        accuracy vs. the swept hyperparameter, and return the classifier with
        the best test-set accuracy.

        :param clfs: fitted classifiers from train()
        :param alphas/depths/min_samples_leafs/min_samples_splits: candidate
            value lists; only the one selected by `flag` is used
        :return: the classifier with the highest test-set accuracy

        The original implementation repeated this logic verbatim once per
        flag; it is now a single parameterized path.
        '''
        if self.verbose:
            print("Testing Decision Tree Model...")
        values = {0: alphas, 1: depths, 2: min_samples_leafs,
                  3: min_samples_splits}[flag]
        label, plot_file = self._SWEEP_INFO[flag]
        return self._evaluate(clfs, values, label, plot_file,
                              X_train, X_test, y_train, y_test)

    def _evaluate(self, clfs, values, label, plot_file,
                  X_train, X_test, y_train, y_test):
        # Accuracy of every candidate on the train and test validation sets.
        self.accuracy_score_train = []
        self.accuracy_score_test = []
        for clf in clfs:
            predictions_train = clf.predict(X_train)
            predictions_test = clf.predict(X_test)
            self.accuracy_score_train.append(
                accuracy_score(y_train, predictions_train))
            self.accuracy_score_test.append(
                accuracy_score(y_test, predictions_test))

        best = max(self.accuracy_score_test)
        best_idx = self.accuracy_score_test.index(best)

        # Print out and log the best accuracy / hyperparameter combination.
        print("Best Accuracy Score (Test Validation Set): ", best)
        print(f"Best {label} (Highest Accuracy, Test Validation Set): ",
              values[best_idx])
        self.f.write("Best Accuracy Score (Test Validation Set): " +
                     str(best) + "\n")
        self.f.write(f"Best {label} (Highest Accuracy, Test Validation Set): " +
                     str(values[best_idx]) + "\n")

        # Accuracy-vs-hyperparameter curves for both validation sets.
        plt.figure()
        plt.plot(values, self.accuracy_score_train,
                 label='Accuracy Score (Training Validation Set)')
        plt.plot(values, self.accuracy_score_test,
                 label='Accuracy Score (Test Validation Set)')
        plt.xlabel(label)
        plt.ylabel('Accuracy')
        plt.title(f'Accuracy vs {label} Value')
        plt.legend()
        plt.savefig('/Users/ajinkya.bagde/Desktop/AS1_Figs/DT/' + plot_file)

        return clfs[best_idx]

    def tune_hyperparameters(self, final_dt, xtrain, ytrain):
        '''Grid-search over self.param_dict with self.n_folds-fold CV and
        return the best parameter combination.'''
        self.grid = GridSearchCV(final_dt,
                                 param_grid=self.param_dict,
                                 cv=self.n_folds,
                                 verbose=1,
                                 n_jobs=-1)
        self.grid.fit(xtrain, ytrain)
        self.f.write("Best Params from GridSearchCV: " +
                     str(self.grid.best_params_))
        return self.grid.best_params_

    def final_test(self, clf, xtest, ytest):
        '''Evaluate the final model on the held-out test set, log the score,
        and close the log file.'''
        prediction_test = clf.predict(xtest)
        print(accuracy_score(ytest, prediction_test))
        self.f.write("Final Accuracy Score (Test Set): " +
                     str(accuracy_score(ytest, prediction_test)))
        self.f.close()
plot_tree(clsf_des_tree, filled=True, rounded=True, class_names=["No HD", "Yes HD"], feature_names=X_encoded.columns) ## Plot confusion matrix plot_confusion_matrix(clsf_des_tree, X_test, y_test, display_labels=["No HD", "Yes HD"]) ## pruning the tree to fix over fitting issue path = clsf_des_tree.cost_complexity_pruning_path(X_train, y_train) ccp_alphas = path.ccp_alphas # extract different value for alpha ccp_alphas = ccp_alphas[:-1] # exclude the maximum value of alpha clf_dts = [] # Now create one decision tree per value for alpha and store it in the array for ccp_alpha in ccp_alphas: clf_dt = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) clf_dt.fit(X_train, y_train) clf_dts.append(clf_dt) train_scores = [clf_dt.score(X_train, y_train) for clf_dt in clf_dts] test_scores = [clf_dt.score(X_test, y_test) for clf_dt in clf_dts]
############################################################################## ############### further steps if using Decision Tree Classifier ############# from sklearn.tree import plot_tree # (training set) plt.figure(figsize=(15, 7.5)) plot_tree(dtc,filled = True, rounded = True, class_names = ['No heart failure','With heart failure'], feature_names = X_train.columns) ############################################################################## # Cost Complexity Pruning - CCP - to avoid overfitting dtc.cost_complexity_pruning_path(X_train, y_train) # remove the impurities pruned = dtc.cost_complexity_pruning_path(X_train, y_train) # values of alpha ccp_alphas = pruned.ccp_alphas # extract different values for alpha ccp_alphas = ccp_alphas[:-1] # excluding max value of alpha (last value) ccp_alphas ''' create one decision tree for each value of alpha and store it in the array dtcs ''' dtcs = [] for ccp_alpha in ccp_alphas: dtc = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) dtc.fit(X_train, y_train) dtcs.append(dtc) pruned.ccp_alphas
def Alphatest():
    """Train a decision tree on the predictor features, prune it along the
    cost-complexity path, pick the tree with the best test accuracy, and
    evaluate that tree (plus a fixed-alpha retrain) on a separate benchmark
    dataset.

    Side effects: prints accuracy metrics and renders the best tree to
    "DTtesttree" via graphviz.
    """
    col_names = ['residue', 'predus', 'ispred', 'dockpred', 'annotated']
    # Load dataset and forward-fill missing values.
    df = pd.read_csv(
        "/Users/evanedelstein/Desktop/Research_Evan/Raji_Summer2019_atom/Data_Files/Logistic_regresion_corrected/noxdata.csv",
        header=None,
        names=col_names)
    data = df.fillna(method='ffill')

    # Independent variables (features) and the target.
    feature_cols = ['predus', 'ispred', 'dockpred']
    X = data[feature_cols]  # Features
    y = data.annotated      # Target variable

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=1)

    # Unpruned baseline tree.
    model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
    print('Training accuracy: ', model.score(X_train, y_train))
    print('Test Accuracy: ', model.score(X_test, y_test))

    # One tree per effective alpha along the cost-complexity pruning path.
    path = model.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas
    clfs = []
    for ccp_alpha in ccp_alphas:
        pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        pruned.fit(X_train, y_train)
        clfs.append(pruned)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))

    # Drop the largest alpha: it prunes the tree down to a single node.
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    # Accuracy-vs-alpha curves for train and test splits.
    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test",
            drawstyle="steps-post")
    ax.legend()
    # plt.show()

    # Best pruned tree = highest test accuracy along the path.
    index_best_model = np.argmax(test_scores)
    best_model = clfs[index_best_model]
    print('Training accuracy of best model: ',
          best_model.score(X_train, y_train))
    print('Test accuracy of best model: ', best_model.score(X_test, y_test))
    print(best_model.get_params())

    # BUG FIX: render the best pruned tree. The original exported `model`,
    # which at this point was the *last* tree from the alpha loop (the most
    # heavily pruned one), not the selected best model.
    dot_data = tree.export_graphviz(best_model, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("DTtesttree")

    # Benchmark set: evaluate the best pruned tree on unseen data.
    benchmarkna = pd.read_csv(
        '/Users/evanedelstein/Desktop/Research_Evan/Raji_Summer2019_atom/Data_Files/Logistic_regresion_corrected/benchmarkdata.csv',
        header=None,
        names=col_names)
    benchmark = benchmarkna.fillna(method='ffill')
    X_bench = benchmark[feature_cols]
    y_bench = benchmark.annotated
    y_predict_best = best_model.predict(X_bench)
    print(accuracy_score(y_bench, y_predict_best))

    # Retrain with a fixed alpha on the full dataset and score on the benchmark.
    model2 = tree.DecisionTreeClassifier(ccp_alpha=0.0010540940654176906)
    model2.fit(X, y)
    y_predict = model2.predict(X_bench)
    # Accuracy
    print(accuracy_score(y_bench, y_predict))
# Feature-importance table for the fitted tree m1: one row per feature with
# its importance score (higher score = higher significance), sorted so the
# most important features come first.
impf = pd.DataFrame({'features': trainx.columns,
                     'score': m1.feature_importances_})
impf = impf.sort_values('score', ascending=False)

# Bar chart of the significant features.
sns.barplot(x=impf.score, y=impf.features)
plt.title('Decision Tree - Significant Features')
plt.xlabel('Score')
plt.ylabel('Features')

# Decision-tree pruning: candidate cost-complexity parameter (alpha) values
# from the pruning path of m1.
dt_path = m1.cost_complexity_pruning_path(trainx, trainy)
ccp_alphas = dt_path.ccp_alphas

# Fit one tree per candidate alpha to find the best ccp_alpha value.
results = [DecisionTreeClassifier(ccp_alpha=alpha).fit(trainx, trainy)
           for alpha in ccp_alphas]

# Accuracy scores of every candidate on the train and test data
# (plotted in the next section).
trg_score = [candidate.score(trainx, trainy) for candidate in results]
test_score = [candidate.score(testx, testy) for candidate in results]
class DecisionTreeModel(BaseLearner):
    """Decision-tree learner with cost-complexity post-pruning diagnostics."""

    def __init__(self, X_train, X_test, y_train, y_test, pipe,
                 pre_processed_feature_names, class_names, dataset_name):
        self.model = DecisionTreeClassifier(random_state=1)
        super().__init__(X_train, X_test, y_train, y_test, pipe, self.model,
                         pre_processed_feature_names, class_names,
                         dataset_name)
        self.model.fit(self.X_train, self.y_train)
        # Base kwargs reused when rebuilding the tree
        # (see update_and_refit_model / post_prune).
        self.model_params = {'random_state': 1}

    def fit(self):
        """Fit the underlying tree on the training split.

        BUG FIX: the original called super().model.fit(self.X_train,
        self.X_test) — `model` is an instance attribute (not found via the
        super() proxy) and the second argument passed the *test features*
        where the training labels belong.
        """
        self.model.fit(self.X_train, self.y_train)

    def predict(self, y):
        """Predict labels for feature matrix `y` and return them.

        BUG FIX: the original used super().model (broken attribute lookup)
        and discarded the predictions.
        """
        return self.model.predict(y)

    '''
    Code adapted from:
    https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
    '''

    def post_prune(self):
        """Sweep the cost-complexity pruning path and save three diagnostic
        figures under OUTPUT/<dataset>/DecisionTree: impurity vs alpha,
        node-count & depth vs alpha, and mean train/cross-val F1 vs alpha.
        """
        path = self.model.cost_complexity_pruning_path(self.X_train,
                                                       self.y_train)
        ccp_alphas, impurities = path.ccp_alphas, path.impurities
        title_dic = {'fontsize': 6, 'fontweight': 'bold'}

        # Figure 1: impurity vs alpha (last point skipped — it corresponds to
        # the trivial single-node tree).
        fig, ax = plt.subplots()
        ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o',
                drawstyle="steps-post")
        ax.set_xlabel("effective alpha", title_dic)
        ax.set_ylabel("total impurity of leaves", title_dic)
        ax.set_title("Total Impurity vs effective alpha for training set",
                     title_dic)
        ax.tick_params(axis="x", labelsize=6)
        ax.tick_params(axis="y", labelsize=6)
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
        ax.grid()
        plt.tight_layout()
        out_dir = os.path.join(OUTPUT, self.dataset_name, "DecisionTree")
        filename = "Impurities vs Alpha" + "_" + "DecisionTree" + "_" + self.dataset_name + ".png"
        plt.savefig(os.path.join(out_dir, filename))

        # One tree per candidate alpha.
        # NOTE(review): `params` aliases self.model_params, so the loop leaves
        # the last ccp_alpha stored in self.model_params, which
        # update_and_refit_model() then uses. Behavior preserved as-is —
        # confirm this is intentional.
        clfs = []
        for ccp_alpha in ccp_alphas:
            params = self.model_params
            params['ccp_alpha'] = ccp_alpha
            clf = DecisionTreeClassifier(**params)
            clf.fit(self.X_train, self.y_train)
            clfs.append(clf)
        # Drop the largest alpha / single-node tree.
        clfs = clfs[:-1]
        ccp_alphas = ccp_alphas[:-1]

        # Figure 2: node count and tree depth as functions of alpha.
        node_counts = [clf.tree_.node_count for clf in clfs]
        depth = [clf.tree_.max_depth for clf in clfs]
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(ccp_alphas, node_counts, marker='o',
                   drawstyle="steps-post")
        ax[0].set_xlabel("alpha", title_dic)
        ax[0].set_ylabel("number of nodes", title_dic)
        ax[0].set_title("Number of nodes vs alpha", title_dic)
        ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
        ax[1].set_xlabel("alpha", title_dic)
        ax[1].set_ylabel("depth of tree", title_dic)
        ax[1].set_title("Depth vs alpha", title_dic)
        for sub in ax:
            sub.grid()
            sub.tick_params(axis="x", labelsize=6)
            sub.tick_params(axis="y", labelsize=6)
            sub.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
        fig.tight_layout()
        out_dir = os.path.join(OUTPUT, self.dataset_name, "DecisionTree")
        filename = "Nodes and Depth vs Alpha" + "_" + "DecisionTree" + "_" + self.dataset_name + ".png"
        plt.savefig(os.path.join(out_dir, filename))

        # Figure 3: mean train / cross-validation F1 along the alpha sweep.
        train_scores = []
        valid_scores = []
        for clf in clfs:
            cv = cross_validate(clf, self.X_train, self.y_train,
                                scoring='f1', return_train_score=True)
            train_scores.append(np.mean(cv['train_score']))
            valid_scores.append(np.mean(cv['test_score']))

        title = "MC Curve for MCC alpha" + "\n" + self.dataset_name
        title_dic = {'fontsize': 6, 'fontweight': 'bold'}
        fig, ax1 = plt.subplots(1, 1, figsize=(3, 2))
        ax1.set_title(title, title_dic)
        ax1.set_ylabel("Mean F1 Score", title_dic)
        ax1.set_xlabel("alpha", title_dic)
        ax1.tick_params(axis="x", labelsize=6)
        ax1.tick_params(axis="y", labelsize=6)
        ax1.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
        ax1.plot(ccp_alphas, train_scores, 'r', linewidth=2, label="train")
        ax1.plot(ccp_alphas, valid_scores, 'b', linewidth=2,
                 label="cross val")
        ax1.legend(loc='best', fontsize=6)
        ax1.grid()
        plt.tight_layout()
        out_dir = os.path.join(OUTPUT, self.dataset_name, "DecisionTree")
        filename = "MC_Curve_" + "alpha" + ".png"
        plt.savefig(os.path.join(out_dir, filename))

    def update_and_refit_model(self):
        """Rebuild the tree from self.model_params and refit on the training set."""
        self.model = DecisionTreeClassifier(**self.model_params)
        self.model.fit(self.X_train, self.y_train)

    def export_graph(self, filename):
        """Refit the tree and render it to <filename>.eps."""
        self.model.fit(self.X_train, self.y_train)
        plt.figure()
        plot_tree(self.model,
                  feature_names=self.pre_processed_feature_names,
                  class_names=self.class_names,
                  rounded=True,
                  filled=True,
                  fontsize=4)
        plt.savefig(filename + ".eps", format='eps', bbox_inches='tight')
def dt(d, id=None):
    """Train, tune and evaluate a decision tree on dataframe `d`.

    Parameters
    ----------
    d : pandas DataFrame holding the features plus a 'y' target column.
    id : dataset identifier used in output figure names; 'E' selects f1
        scoring for the grid search, anything else uses roc_auc.

    Side effects: prints metrics and saves learning-curve, validation-curve,
    ROC and confusion-matrix figures under figures/.
    """
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, shuffle=True, random_state=SEED)
    # Single stratified train/test split (preserves class balance).
    split = StratifiedShuffleSplit(n_splits=1, test_size=.33,
                                   random_state=SEED)
    for i, j in split.split(d, d['y']):
        # NOTE(review): split.split yields positional indices; d.loc only
        # matches them if d has a default RangeIndex — confirm, else use iloc.
        train_set = d.loc[i]
        test_set = d.loc[j]
    y_train, y_test = train_set['y'], test_set['y']
    X_train, X_test = train_set.drop('y', axis=1), test_set.drop('y', axis=1)

    # Scoring metric for the grid search depends on the dataset.
    if id == 'E':
        scoring = 'f1'
    else:
        scoring = 'roc_auc'

    # Baseline: default decision tree without any tuning.
    model_naive = DecisionTreeClassifier()
    model_naive.fit(X_train, y_train)
    pred = model_naive.predict(X_test)
    cv = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    print('Accuracy of default model is: ', accuracy_score(y_test, pred))
    print('F1 of default model is: ', f1_score(y_test, pred))
    print('roc_auc of default is: ', roc_auc_score(y_test, pred))
    plot_lc(model_naive, 'Learning Curve - Default Model', X_train, y_train,
            cv=cv, n_jobs=-1,
            path='figures/DT_{}_Default_learning.png'.format(id))
    plt.clf()
    print('plot saved')

    # Grid search over criterion, depth and pruning strength.
    model = DecisionTreeClassifier(random_state=SEED)
    criterion = ['gini', 'entropy']
    max_depth = np.arange(1, 50)
    ccp_alpha = [.005, .003, .002, .001]
    grid = dict(criterion=criterion, max_depth=max_depth,
                ccp_alpha=ccp_alpha)
    # cv = KFold(n_splits=3, random_state=SEED, shuffle=True)
    out = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                       scoring=scoring, error_score=0)
    result = out.fit(X_train, y_train)
    best_model = result.best_estimator_
    best_params = result.best_params_
    print("Best {} {} using params {}".format(scoring, result.best_score_,
                                              best_params))

    # ccpAlphas: effective alphas of the cost-complexity pruning path,
    # excluding the maximum (single-node tree). cost_complexity_pruning_path
    # builds the tree itself, so `model` does not need a prior fit.
    path = model.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas[:-1]

    # learning_curve for the tuned model
    plot_lc(best_model, 'Learning Curve - Best Model', X_train, y_train,
            cv=cv, n_jobs=-1,
            path='figures/DT_{}_Best_learning.png'.format(id))
    plt.clf()

    # validation curve over 20 evenly spaced ccp_alpha values spanning the path
    param_range = np.linspace(ccp_alphas[0], ccp_alphas[-1], num=20)
    plot_vc(best_model, X_train, y_train, 'Validation Curve', 'ccp_alpha',
            'ccp_alphas', param_range, scoring, cv=cv,
            path='figures/DT_{}_Best_valid.png'.format(id))
    plt.clf()

    # ROC curve on the held-out test set.
    plot_roc_curve(best_model, X_test, y_test)
    plt.savefig('figures/DT_{}_ROC.png'.format(id))
    plt.clf()

    # Refit the best model and report final test metrics.
    best_model.fit(X_train, y_train)
    pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    c_matrix = confusion_matrix(y_test, pred)
    c_report = classification_report(y_test, pred)
    print('Best Accuracy: ', accuracy)
    print('confusion_matrix: \n', c_matrix)
    print('classification_report: \n', c_report)
    plot_confusion_matrix(best_model, X_test, y_test, normalize='pred')
    plt.savefig("figures/DT_{}_Confusion.png".format(id))
    plt.clf()
out_file=None, feature_names=col_name, class_names=['infected', 'noninfected'], filled=True, rounded=True, special_characters=True) graph = graphviz.Source(graph_data) graph.format = 'png' graph.render("COVID_19_DTpruned_py", view=True) # %% print(*zip(col_name, Single_tree_pruned.feature_importances_)) # %% # Post pruning prune_path = Single_tree.cost_complexity_pruning_path(Trained_transformed, target_train) ccp_alphas, impurities = prune_path.ccp_alphas, prune_path.impurities fig, ax = plt.subplots() ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post") ax.set_xlabel("effective alpha") ax.set_ylabel("total impurity of leaves") ax.set_title("Total Impurity vs effective alpha for training set") plt.show() clfs = [] for ccp_alpha in ccp_alphas: clf = DecisionTreeClassifier(criterion='entropy', random_state=1, ccp_alpha=ccp_alpha)