Beispiel #1
0
def decisionTreeExamplePlot():
    """Plot train/test accuracy of cost-complexity-pruned trees for depths 1..19.

    For every depth a depth-limited tree is used only to compute a pruning
    path; the alpha located 24/25 of the way along that path is then used to
    fit the tree that is actually scored. Reads the module-level
    ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``.
    """
    depths = np.arange(1, 20)
    n_depths = len(depths)
    train_accuracy = np.empty(n_depths)
    test_accuracy = np.empty(n_depths)

    for idx, depth in enumerate(depths):
        # The depth limit only influences which alpha gets selected.
        probe = DecisionTreeClassifier(max_depth=depth, random_state=12)
        alphas = probe.cost_complexity_pruning_path(X_train, Y_train).ccp_alphas
        chosen_alpha = alphas[int(len(alphas) * 24 / 25)]

        # NOTE(review): the scored tree is not given max_depth, although the
        # x-axis is labelled 'Max Depth' -- confirm this is intended.
        pruned = DecisionTreeClassifier(ccp_alpha=chosen_alpha,
                                        random_state=12)
        pruned.fit(X_train, Y_train)
        train_accuracy[idx] = pruned.score(X_train, Y_train)
        test_accuracy[idx] = pruned.score(X_test, Y_test)

    curves = (
        (test_accuracy, 'Decision Tree Testing dataset Accuracy'),
        (train_accuracy, 'Decision Tree Training dataset Accuracy'),
    )
    for values, caption in curves:
        plt.plot(depths, values, label=caption)

    plt.legend()
    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.show()
Beispiel #2
0
def decision_tree():
    """Plot train and test accuracy against ``ccp_alpha`` along the pruning path.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    One tree is fitted per alpha on the path; the last alpha (and its tree) is
    dropped before scoring and plotting.
    """
    base = DecisionTreeClassifier(random_state=0)
    alphas = base.cost_complexity_pruning_path(X_train, y_train).ccp_alphas

    fitted = [
        DecisionTreeClassifier(random_state=0,
                               ccp_alpha=alpha).fit(X_train, y_train)
        for alpha in alphas
    ]

    # Discard the final entry of the path together with its tree.
    fitted, alphas = fitted[:-1], alphas[:-1]

    train_scores = [model.score(X_train, y_train) for model in fitted]
    test_scores = [model.score(X_test, y_test) for model in fitted]

    _, axes = plt.subplots()
    axes.set_xlabel("alpha")
    axes.set_ylabel("accuracy")
    axes.set_title("Accuracy vs alpha for training and testing sets")
    for scores, tag in ((train_scores, "train"), (test_scores, "test")):
        axes.plot(alphas,
                  scores,
                  marker='o',
                  label=tag,
                  drawstyle="steps-post")
    axes.legend()
    plt.show()
    def explore_tree_pruning(self):
        """Plot train/test accuracy for every alpha on the entropy pruning path.

        Splits ``self.data`` (features ``self.feature_names``, target
        ``self.target``) with a stratified, unseeded ``train_test_split``,
        fits one entropy-criterion tree per alpha and shows the resulting
        accuracy curves.
        """
        labels = self.data[self.target]
        features = self.data[self.feature_names]
        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            labels,
                                                            stratify=labels)

        base_tree = DecisionTreeClassifier(random_state=0,
                                           criterion='entropy')
        pruning_path = base_tree.cost_complexity_pruning_path(X_train, y_train)
        alphas = pruning_path.ccp_alphas

        trees = [
            DecisionTreeClassifier(random_state=0,
                                   criterion='entropy',
                                   ccp_alpha=alpha).fit(X_train, y_train)
            for alpha in alphas
        ]

        train_scores = [model.score(X_train, y_train) for model in trees]
        test_scores = [model.score(X_test, y_test) for model in trees]

        _, axes = plt.subplots()
        axes.set_xlabel('alpha')
        axes.set_ylabel('accuracy')
        axes.set_title('accuracy vs. alpha')
        for scores, tag in ((train_scores, 'train'), (test_scores, 'test')):
            axes.plot(alphas,
                      scores,
                      marker='o',
                      label=tag,
                      drawstyle='steps-post')
        axes.legend()
        plt.show()
Beispiel #4
0
def decision_tree(X_train, X_test, y_train, y_test, criterion, max_features):
    """Return the best test accuracy over the cost-complexity pruning path.

    The candidate alphas come from a default tree (``random_state=0``); for
    each alpha a tree with the requested ``criterion`` and ``max_features``
    is fitted. Only the first tree seen for each distinct node count is
    scored, so duplicate trees are neither predicted on nor counted.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        criterion: Split criterion forwarded to ``DecisionTreeClassifier``.
        max_features: ``max_features`` forwarded to ``DecisionTreeClassifier``.

    Returns:
        The maximum test-set accuracy among the distinct trees.
    """
    path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(
        X_train, y_train)

    seen_node_counts = set()
    test_scores = []
    for ccp_alpha in path.ccp_alphas:
        # NOTE: the candidate trees are unseeded, exactly as before, so the
        # result can vary between runs when max_features subsamples features.
        clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha,
                                     criterion=criterion,
                                     max_features=max_features)
        clf.fit(X_train, y_train)

        node_count = clf.tree_.node_count
        if node_count in seen_node_counts:
            # Same size as an earlier tree: skip the (costly) predictions.
            continue
        seen_node_counts.add(node_count)

        test_scores.append(
            metrics.accuracy_score(y_test, clf.predict(X_test)))

    return max(test_scores)
Beispiel #5
0
def plot_decision_tree_alphas(X, y):
    """Plot train/test accuracy of pruned decision trees against ``ccp_alpha``.

    Holds out 30% of ``(X, y)`` as a test set (seed 667), computes the
    cost-complexity pruning path on the training part, drops the last alpha
    and fits and scores one tree per remaining alpha.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=667)

    # Candidate alphas from the pruning path, without the final one.
    base_tree = DecisionTreeClassifier(random_state=667)
    pruning_path = base_tree.cost_complexity_pruning_path(X_train, y_train)
    alphas = pruning_path.ccp_alphas[:-1]

    train_acc = []
    test_acc = []
    for alpha in alphas:
        fitted = DecisionTreeClassifier(random_state=667,
                                        ccp_alpha=alpha).fit(X_train, y_train)
        train_acc.append(accuracy_score(y_train, fitted.predict(X_train)))
        test_acc.append(accuracy_score(y_test, fitted.predict(X_test)))

    # Step plot of both accuracy curves.
    _, axes = plt.subplots()
    axes.set_xlabel('alpha')
    axes.set_ylabel('accuracy')
    axes.set_title('Accuracy vs alpha for training and testing sets')
    for accuracies, tag in ((train_acc, 'train'), (test_acc, 'test')):
        axes.plot(alphas,
                  accuracies,
                  marker='o',
                  label=tag,
                  drawstyle='steps-post')
    axes.legend()
    plt.show()
Beispiel #6
0
class DecisionTreeModel:
    """Wrapper around a ``DecisionTreeClassifier`` that also accepts ``TabularData``.

    Every wrapped method converts ``TabularData`` arguments with
    ``DataConversion.extract_X`` / ``DataConversion.extract_y`` and then
    forwards to the underlying estimator stored in ``self.model``. Attributes
    not defined on the wrapper are looked up on the estimator.
    """

    def __init__(self, *args, **kwargs):
        """Create the underlying ``DecisionTreeClassifier`` with the given arguments."""
        self.model = DecisionTreeClassifier(*args, **kwargs)

    @staticmethod
    def _features(X):
        """Return ``X`` converted to a feature matrix if it is ``TabularData``."""
        if isinstance(X, TabularData):
            return DataConversion.extract_X(X)
        return X

    @staticmethod
    def _labels(y):
        """Return ``y`` converted to a label vector if it is ``TabularData``."""
        if isinstance(y, TabularData):
            return DataConversion.extract_y(y)
        return y

    def get_model(self):
        """Return the wrapped estimator."""
        return self.model

    def apply(self, X, check_input=True):
        """Forward to ``model.apply`` after converting ``X``."""
        return self.model.apply(self._features(X), check_input=check_input)

    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
        """Forward to ``model.cost_complexity_pruning_path`` after converting inputs."""
        return self.model.cost_complexity_pruning_path(
            self._features(X), self._labels(y), sample_weight=sample_weight)

    def decision_path(self, X, check_input=True):
        """Forward to ``model.decision_path`` after converting ``X``."""
        return self.model.decision_path(self._features(X),
                                        check_input=check_input)

    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        """Fit the wrapped estimator and return this wrapper.

        ``X_idx_sorted`` is forwarded only when the caller supplies it, so the
        call also works on scikit-learn releases whose ``fit`` no longer
        accepts that parameter.
        """
        extra = {} if X_idx_sorted is None else {"X_idx_sorted": X_idx_sorted}
        self.model.fit(self._features(X),
                       self._labels(y),
                       sample_weight=sample_weight,
                       check_input=check_input,
                       **extra)
        return self

    def predict(self, X, check_input=True):
        """Forward to ``model.predict`` after converting ``X``."""
        return self.model.predict(self._features(X), check_input=check_input)

    def predict_log_proba(self, X):
        """Forward to ``model.predict_log_proba`` after converting ``X``."""
        return self.model.predict_log_proba(self._features(X))

    def predict_proba(self, X, check_input=True):
        """Forward to ``model.predict_proba`` after converting ``X``."""
        return self.model.predict_proba(self._features(X),
                                        check_input=check_input)

    def score(self, X, y, sample_weight=None):
        """Forward to ``model.score`` after converting inputs."""
        return self.model.score(self._features(X),
                                self._labels(y),
                                sample_weight=sample_weight)

    def __getattr__(self, item):
        """Delegate attributes missing on the wrapper to the wrapped estimator.

        ``__getattr__`` runs only after normal lookup has failed, so wrapper
        attributes keep priority without a catch-all ``except``.

        Raises:
            AttributeError: If neither the wrapper nor the estimator has
                ``item``, or if ``model`` itself has not been set yet.
        """
        if item == "model":
            # Guard against unbounded recursion when ``model`` is unset
            # (e.g. an instance created without running ``__init__``).
            raise AttributeError(item)
        return getattr(self.model, item)
Beispiel #7
0
def decisionTree(data):
    """Pick a pruning alpha, then plot accuracy against training-set size.

    Step 1 splits ``data`` 50/50, fits one tree per alpha on the pruning path
    (without the last two alphas), plots train/test accuracy and keeps the
    alpha with the highest test accuracy. Step 2 re-splits the data for
    several training fractions and seeds, averages the scores returned by
    ``evaluation`` and plots them.

    Args:
        data: Indexable where ``data[0]`` holds the features and ``data[1]``
            the labels.

    Note:
        The alpha is selected on the same test split it is scored on, so the
        reported accuracy is optimistic.
    """
    features, targets = data[0], data[1]

    # find the best alpha for pruning
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.5, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas[:-2]

    clfs = [
        DecisionTreeClassifier(random_state=0,
                               ccp_alpha=alpha).fit(X_train, y_train)
        for alpha in ccp_alphas
    ]
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    # plot
    fig, ax = plt.subplots()
    ax.set_xlabel("Alpha")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs Alpha")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test")
    ax.legend()
    plt.show()

    # best alpha (first one reaching the maximum test accuracy)
    ccp_alpha = ccp_alphas[test_scores.index(max(test_scores))]
    print('Alpha: %2f' % ccp_alpha)

    # train with various training size
    train_size = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]
    split_seeds = [*range(15), 100, 1000]
    train_scores, test_scores = [], []
    for ts in train_size:
        tmp1, tmp2 = [], []
        for state in split_seeds:
            # split train and test set
            X_train, X_test, y_train, y_test = train_test_split(
                features, targets, test_size=1 - ts, random_state=state)

            # train and evaluate model
            clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
            res_train, res_test = evaluation(clf, X_train, X_test, y_train,
                                             y_test)
            tmp1.append(res_train)
            tmp2.append(res_test)

        # average over all seeds for this training fraction
        train_scores.append(sum(tmp1) / len(tmp1))
        test_scores.append(sum(tmp2) / len(tmp2))

    # plot
    fig, ax = plt.subplots()
    ax.set_xlabel("Training size %")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs Training size")
    ax.plot(train_size, train_scores, marker='o', label="train")
    ax.plot(train_size, test_scores, marker='o', label="test")
    ax.legend()
    plt.show()

    # A nested print() used to emit an extra "None" line here.
    print('Overall accuracy: %2f' % max(test_scores))
Beispiel #8
0
def calculate():
    """Explore ``iris.csv``: plots, a pruned tree, an SVC and a pruning sweep.

    Shows a petal-width histogram and a pair plot, scores a decision tree
    (``ccp_alpha=0.01``) on a 70/30 split, plots the confusion matrix of an
    SVC, and finally plots leaf impurity and test accuracy along the
    cost-complexity pruning path.
    """
    feature_columns = [
        'sepal.length', 'sepal.width', 'petal.length', 'petal.width'
    ]

    df = pd.read_csv('iris.csv')
    df['petal.width'].plot.hist()
    plt.show()
    sns.pairplot(df, hue='species')
    plt.show()

    all_inputs = df[feature_columns].values
    all_classes = df['species'].values
    split = train_test_split(all_inputs,
                             all_classes,
                             train_size=0.7,
                             random_state=1)
    train_inputs, test_inputs, train_classes, test_classes = split

    dtc = DecisionTreeClassifier(random_state=0, ccp_alpha=0.01)
    dtc.fit(train_inputs, train_classes)
    print("Score is " + str(dtc.score(test_inputs, test_classes)))

    svc = SVC(random_state=0)
    svc.fit(train_inputs, train_classes)
    plot_confusion_matrix(svc, test_inputs, test_classes)
    plt.show()
    # Observation recorded by the original author from the matrix:
    # setosa - no mistakes
    # versicolor - no mistakes
    # virginica was confused with versicolor 1 time

    # pruning
    path = dtc.cost_complexity_pruning_path(train_inputs, train_classes)
    print(path)
    ccp_alphas = path.ccp_alphas
    impurities = path.impurities
    plt.figure(figsize=(10, 6))
    plt.plot(ccp_alphas, impurities)
    plt.xlabel("effective alpha")
    plt.ylabel("")
    plt.show()

    # One tree per alpha on the pruning path.
    pruned_trees = [
        DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(
            train_inputs, train_classes) for alpha in ccp_alphas
    ]
    acc_scores = [
        accuracy_score(test_classes, model.predict(test_inputs))
        for model in pruned_trees
    ]
    tree_depths = [model.tree_.max_depth for model in pruned_trees]

    # The last alpha is left out of the accuracy plot.
    plt.figure(figsize=(10, 6))
    plt.grid()
    plt.plot(ccp_alphas[:-1], acc_scores[:-1])
    plt.xlabel("effective alpha")
    plt.ylabel("Accuracy scores")
    plt.show()
def decision_tree_post_pruning(X, y):
    """Visualise how cost-complexity pruning affects tree size and accuracy.

    Splits ``(X, y)`` (seed 42), fits one tree per alpha on the pruning path,
    reports the node count of the last tree, then plots node count, depth and
    train/test accuracy against alpha with the last alpha removed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    base_tree = DecisionTreeClassifier(random_state=0)
    alphas = base_tree.cost_complexity_pruning_path(X_train,
                                                    y_train).ccp_alphas

    models = [
        DecisionTreeClassifier(random_state=0,
                               ccp_alpha=alpha).fit(X_train, y_train)
        for alpha in alphas
    ]
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        models[-1].tree_.node_count, alphas[-1]))

    # The last alpha and its tree are excluded from every plot below.
    models, alphas = models[:-1], alphas[:-1]

    node_counts = [model.tree_.node_count for model in models]
    depth = [model.tree_.max_depth for model in models]
    size_fig, size_axes = plt.subplots(2, 1)
    panels = (
        (node_counts, "number of nodes", "Number of nodes vs alpha"),
        (depth, "depth of tree", "Depth vs alpha"),
    )
    for axis, (values, y_label, title) in zip(size_axes, panels):
        axis.plot(alphas, values, marker='o', drawstyle="steps-post")
        axis.set_xlabel("alpha")
        axis.set_ylabel(y_label)
        axis.set_title(title)
    size_fig.tight_layout()

    train_scores = [model.score(X_train, y_train) for model in models]
    test_scores = [model.score(X_test, y_test) for model in models]

    _, acc_axis = plt.subplots()
    acc_axis.set_xlabel("alpha")
    acc_axis.set_ylabel("accuracy")
    acc_axis.set_title("Accuracy vs alpha for training and testing sets")
    for scores, tag in ((train_scores, "train"), (test_scores, "test")):
        acc_axis.plot(alphas,
                      scores,
                      marker='o',
                      label=tag,
                      drawstyle="steps-post")
    acc_axis.legend()
    plt.show()
Beispiel #10
0
def tree(x_train, x_test, y_train, y_test, features, criterion, test_split,
         dataset):
    """Return the best test accuracy over the cost-complexity pruning path.

    Alphas are taken from the pruning path of a default tree
    (``random_state=0``). For each alpha a tree with the given ``criterion``
    and ``max_features=features`` is fitted; only the first tree seen for
    each distinct node count is evaluated.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        features: Value forwarded as ``max_features``.
        criterion: Split criterion forwarded to ``DecisionTreeClassifier``.
        test_split: Only referenced by the disabled plotting code below.
        dataset: Only referenced by the disabled plotting code below.

    Returns:
        The maximum test-set accuracy among the distinct trees.
    """
    test_accuracies = []
    training_accuracies = []
    nodes = []

    clf = DecisionTreeClassifier(random_state=0)
    alphas = clf.cost_complexity_pruning_path(x_train, y_train).ccp_alphas

    for alpha in alphas:
        clf = DecisionTreeClassifier(ccp_alpha=alpha,
                                     criterion=criterion,
                                     max_features=features)
        clf = clf.fit(x_train, y_train)

        node_count = clf.tree_.node_count
        if node_count in nodes:
            # Same size as an earlier tree: skip the predictions entirely.
            continue

        training_accuracies.append(
            metrics.accuracy_score(y_train, clf.predict(x_train)))
        test_accuracies.append(
            metrics.accuracy_score(y_test, clf.predict(x_test)))
        nodes.append(node_count)

    # Below this line are the most accurate decision trees.

    # if dataset == "Cleveland" and criterion == 'gini' and features == 5 and test_split == 0.1:
    #     plt.title("Cleveland Test Data and Training Data Accuracy")
    #     plt.xlabel("# of Nodes")
    #     plt.ylabel("Test Data Accuracy (Red) and Training Data Accuracy (Blue)")
    #     plt.plot(nodes, test_accuracies, 'r')
    #     plt.plot(nodes, training_accuracies, 'b')
    #     plt.savefig('ClevelandAccuracies.png')

    # if dataset == "Banknote" and criterion == 'entropy' and features == 1 and test_split == 0.2:
    #     plt.title("Banknote Test Data and Training Data Accuracy")
    #     plt.xlabel("# of Nodes")
    #     plt.ylabel("Test (Green) Accuracy and Training (Yellow) Accuracy")
    #     plt.plot(nodes, test_accuracies, 'g')
    #     plt.plot(nodes, training_accuracies, 'y')
    #     plt.savefig('BanknoteAccuracies.png')

    return max(test_accuracies)
def correlation_with_dependent(X, y, file):
    """Fit a depth-1, class-weighted tree on ``(X, y)`` and report its metrics.

    Prints the confusion matrix, classification report, accuracy, micro /
    macro / per-class precision and the pruning-path impurities, then renders
    the tree through ``create_png`` under ``correlations/``.

    Args:
        X: Feature table; ``X.columns`` is used to set ``max_features``.
        y: Target labels.
        file: File name appended to ``"correlations/"`` for the rendered tree.
    """
    # TODO find variables that contain all of the mortstat=1 values but reduce the sample size
    # One way run a one-level decision tree for each column in X, and chose the variable and cut point
    # with the lowest class==0 impurity -- i.e. all the class==1 nodes are in one leaf
    #
    # Disabled earlier approach:
    #   cor = X.corrwith(y, axis=0)
    #   print(type(cor))
    #   cor = cor.rename_axis('correlation')
    #   cor = cor.sort_values()
    #   print(cor)
    #
    # Alternative weighting noted by the author: class_weight={0: 0.9, 1: 0.1}
    out_dir = "correlations/"
    tree_clf = DecisionTreeClassifier(
        criterion='gini',
        max_depth=1,
        min_samples_split=2,
        max_features=len(X.columns),
        class_weight={
            0: 0.1,
            1: 0.9
        },
    )
    tree_clf.fit(X, y)

    pred = tree_clf.predict(X)
    # Confusion matrix and classification report
    print("Correlation with dependent tree")
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred))

    y_score = tree_clf.score(X, y)
    print('Accuracy: ', y_score)

    # precision_score takes (y_true, y_pred); with the arguments swapped the
    # macro and per-class figures are computed against the wrong reference.
    micro_precision = precision_score(y, pred, average='micro')
    print('Micro-averaged precision score: {0:0.2f}'.format(micro_precision))

    macro_precision = precision_score(y, pred, average='macro')
    print('Macro-averaged precision score: {0:0.2f}'.format(macro_precision))

    per_class_precision = precision_score(y, pred, average=None)
    print('Per-class precision score:', per_class_precision)

    path = tree_clf.cost_complexity_pruning_path(X, y)
    print(f"Impurities: {path.impurities}")

    create_png(out_dir + file, X, tree_clf)
Beispiel #12
0
def prune_tree_model(train_features, train_labels, test_features, test_labels):
    """Return the ``ccp_alpha`` whose pruned tree scores best on the test data.

    Candidate alphas come from the cost-complexity pruning path of an entropy
    tree (``max_depth=10``, ``min_samples_split=5``, ``min_samples_leaf=10``).
    One tree per alpha is fitted and scored; the score of each is printed.

    Args:
        train_features: Training features.
        train_labels: Training labels.
        test_features: Features used to score each pruned tree.
        test_labels: Labels used to score each pruned tree.

    Returns:
        The alpha of the highest-scoring tree; on ties the first (smallest
        path position) wins.
    """
    tree_params = dict(criterion='entropy',
                       max_depth=10,
                       min_samples_split=5,
                       min_samples_leaf=10)
    path = DecisionTreeClassifier(**tree_params).cost_complexity_pruning_path(
        train_features, train_labels)

    best_alpha = None
    # Start below any achievable score so that an all-zero run still yields
    # an alpha instead of failing.
    best_score = float('-inf')
    for ccp_alpha in path.ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0,
                                     ccp_alpha=ccp_alpha,
                                     **tree_params)
        clf.fit(train_features, train_labels)
        acc_score = clf.score(test_features, test_labels)
        print("Score: " + str(acc_score))
        if acc_score > best_score:
            # Track the running maximum; without this update the last alpha
            # with a non-zero score would win regardless of its quality.
            best_score = acc_score
            best_alpha = clf.ccp_alpha
    return best_alpha
Beispiel #13
0
    def DecisionTreeClassifier(self):
        """
        Search the cost-complexity pruning path for the decision tree with the
        lowest nescience.

        Return
        ------
        best_nsc   : best (lowest) nescience achieved
        best_model : the fitted DecisionTreeClassifier that achieved it, or
                     None if no candidate scored below the initial value of 1
        best_viu   : None, since all the variables are used as input
        """

        # Inside the method body the bare name DecisionTreeClassifier resolves
        # to the module-level name, not to this method.
        # Leaves are limited to at least 5 samples; the original author notes
        # that smaller leaves can make the search too slow and regards 5 as
        # good practice.
        pruning_path = DecisionTreeClassifier(
            min_samples_leaf=5).cost_complexity_pruning_path(self.X_, self.y_)

        best_nsc, best_model = 1, None
        last_node_count = -1

        # Walk the pruning points from the largest alpha to the smallest.
        for alpha in reversed(pruning_path.ccp_alphas):

            candidate = DecisionTreeClassifier(ccp_alpha=alpha,
                                               min_samples_leaf=5,
                                               random_state=self.random_state)
            candidate.fit(self.X_, self.y_)

            # A tree of unchanged size is not evaluated again.
            node_count = candidate.tree_.node_count
            if node_count == last_node_count:
                continue
            last_node_count = node_count

            nsc = self.nescience_.nescience(candidate)

            if nsc < best_nsc:
                best_nsc, best_model = nsc, candidate
            elif self.fast:
                # Early stop at the first candidate that does not improve.
                break

        return (best_nsc, best_model, None)
Beispiel #14
0
def post_pruning(X, y):
    """Select a ``ccp_alpha`` for minimal cost-complexity post-pruning.

    Computes the pruning path of a decision tree on ``(X, y)``, drops the last
    alpha, cross-validates (5 folds, negated MSE scoring) one tree per
    remaining alpha and returns the largest alpha whose mean MSE lies within
    one standard deviation (taken over the per-alpha mean MSEs) of the
    minimum -- the most parsimonious tree under that tolerance.

        Parameters
        ----------
        X: Input features

        y: Labels

        Returns
        -------
        ccp_alpha: Selected best alpha a*, or 0.0 when no candidate alpha
            remains after dropping the last one
    """
    # https://medium.com/swlh/post-pruning-decision-trees-using-python-b5d4bcda8e23
    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py

    base_tree = DecisionTreeClassifier(random_state=42)
    candidate_alphas = base_tree.cost_complexity_pruning_path(
        X, y).ccp_alphas[:-1]

    if len(candidate_alphas) == 0:
        return 0.0

    cv_scores = [
        cross_val_score(DecisionTreeClassifier(ccp_alpha=alpha),
                        X,
                        y,
                        cv=5,
                        scoring="neg_mean_squared_error",
                        n_jobs=-1) for alpha in candidate_alphas
    ]

    # Average over folds and flip the sign back to a positive MSE.
    fold_mse = -np.mean(cv_scores, 1)

    # Keep every alpha within the tolerance of the best MSE, then take the
    # largest of them.
    tolerance = np.min(fold_mse) + np.std(fold_mse)
    acceptable = fold_mse <= tolerance
    return np.max(candidate_alphas[acceptable])
Beispiel #15
0
    def get_cpp_alphas(self, data):
        """Build ``self.ccp_alphas`` and save two diagnostic plots to ``output/``.

        First plots leaf impurity against alpha along the pruning path of a
        tree fitted on ``data.X_train`` / ``data.y_train``. Then sets
        ``self.ccp_alphas`` to 100 evenly spaced values from 0 up to the
        second-to-last path alpha whose impurity is below 0.5, and plots node
        count, depth and leaf count of the tree fitted for each value.
        """
        base_tree = DecisionTreeClassifier(random_state=0)
        pruning_path = base_tree.cost_complexity_pruning_path(
            data.X_train, data.y_train)
        path_df = pd.DataFrame(pruning_path)

        # Impurity curve, without the last row of the path.
        path_df[:-1].plot(
            x="ccp_alphas",
            y="impurities",
            style=".-",
            legend=False,
            title=f"Impurities vs ccp_alphas on {data.data_name} data")
        plt.ylabel("total impurity of leaves")
        plt.savefig(os.path.join("output", f"{data.data_name}_ccp_alphas.png"))
        plt.clf()

        low_impurity_alphas = path_df[path_df["impurities"] < 0.5]["ccp_alphas"]
        upper_alpha = low_impurity_alphas.iloc[-2]
        self.ccp_alphas = np.linspace(0, upper_alpha, 100)

        cpp_alphas, node_counts, max_depths, leaf_counts = [], [], [], []
        for alpha in self.ccp_alphas:
            model = DecisionTreeClassifier(random_state=0, ccp_alpha=alpha)
            model.fit(data.X_train, data.y_train)
            fitted_tree = model.tree_
            cpp_alphas.append(alpha)
            node_counts.append(fitted_tree.node_count)
            max_depths.append(fitted_tree.max_depth)
            leaf_counts.append(fitted_tree.n_leaves)

        res_df = pd.DataFrame({
            "cpp_alphas": cpp_alphas,
            "node_count": node_counts,
            "max_depth": max_depths,
            "n_leaves": leaf_counts,
        }).set_index("cpp_alphas")
        res_df.plot(
            subplots=True,
            style=".-",
            figsize=(10, 5),
            title=
            f"Decision tree complexity vs ccp_alpha on {data.data_name} data",
        )
        plt.savefig(
            os.path.join("output", f"{data.data_name}_ccp_alphas_tree.png"))
        plt.clf()
def analyze_ccp_alpha(X_train, X_test, y_train, y_test):
    """ Plot decision-tree accuracy against ``ccp_alpha`` on train and test data.

        Written to check whether tuning ``ccp_alpha`` improves the decision
        tree. Per the original author it is not part of the normal run
        because it takes too long; it is kept for completeness only.
        Code derived from: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

          Args:
            X_train (array): The training documents
            X_test (array): the testing documents
            y_train (array): the training labels
            y_test (array): the testing labels
    """
    base = DecisionTreeClassifier(random_state=0)
    alphas = base.cost_complexity_pruning_path(X_train, y_train).ccp_alphas

    # One fitted tree per alpha on the pruning path (none are dropped).
    fitted = [
        DecisionTreeClassifier(random_state=0,
                               ccp_alpha=alpha).fit(X_train, y_train)
        for alpha in alphas
    ]

    train_scores = [model.score(X_train, y_train) for model in fitted]
    test_scores = [model.score(X_test, y_test) for model in fitted]

    _, axes = plt.subplots()
    axes.set_xlabel("Alpha")
    axes.set_ylabel("Accuracy")
    axes.set_title("Accuracy vs alpha for training and testing sets")
    for scores, tag in ((train_scores, "train"), (test_scores, "test")):
        axes.plot(alphas, scores, marker='o', label=tag,
                  drawstyle="steps-post")
    axes.legend()
    plt.show()
Beispiel #17
0
def opt_ad(x_train, y_train, x_valid, y_valid):
    """Find the pruning alpha that minimises validation RMSE for the decision tree.

    One ``AD`` model is fitted per alpha on the cost-complexity pruning path
    of the training data and scored by root-mean-squared error on the
    validation data. The best error and its alpha are printed.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_valid: Validation features.
        y_valid: Validation targets.

    Returns:
        A one-element array holding the lowest validation RMSE.
    """

    ad = AD()

    path = ad.cost_complexity_pruning_path(x_train, y_train)
    alphas = path.ccp_alphas

    # Kept as an (n, 1) array so the returned value keeps its original shape.
    res = np.zeros((len(alphas), 1))

    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

    for i, alpha in enumerate(alphas):
        ad = AD(random_state=0, ccp_alpha=alpha)
        ad.fit(x_train, y_train)
        # Arguments in the documented (y_true, y_pred) order.
        res[i] = math.sqrt(
            sk.metrics.mean_squared_error(y_valid, ad.predict(x_valid)))

    i = res.argmin()

    # The label used to read 'SVM', although the value is this tree's error.
    print(f'AD = {res[i]}')
    print(f':: alpha = {alphas[i]}')

    return res[i]
Beispiel #18
0
# %%
# Total impurity of leaves vs effective alphas of pruned tree
# ---------------------------------------------------------------
# Minimal cost complexity pruning recursively finds the node with the "weakest
# link". The weakest link is characterized by an effective alpha, where the
# nodes with the smallest effective alpha are pruned first. To get an idea of
# what values of ``ccp_alpha`` could be appropriate, scikit-learn provides
# :func:`DecisionTreeClassifier.cost_complexity_pruning_path` that returns the
# effective alphas and the corresponding total leaf impurities at each step of
# the pruning process. As alpha increases, more of the tree is pruned, which
# increases the total impurity of its leaves.
# Load the breast-cancer data and hold out a test split (seeded).
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Compute the pruning path from the training split only.
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# %%
# In the following plot, the maximum effective alpha value is removed, because
# it is the trivial tree with only one node.
fig, ax = plt.subplots()
# Step plot of total leaf impurity against effective alpha.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

# %%
# Next, we train a decision tree using the effective alphas. The last value
# in ``ccp_alphas`` is the alpha value that prunes the whole tree,
# leaving the tree, ``clfs[-1]``, with one node.
Beispiel #19
0
def decision_tree_experiment(dataset, hparams, output_fn_base):
    """Run the full decision-tree experiment and write its plots to disk.

    The function fits a tree with the given hyperparameters, produces a
    learning curve, confusion matrix and Graphviz rendering, cross-validates
    it, and then sweeps ``ccp_alpha``, ``max_depth`` and ``min_samples_leaf``
    to plot in-sample versus out-of-sample accuracy. All files are written
    below ``<output_fn_base>/DT/``.

    Args:
        dataset: Mapping with a ``"features"`` table and a ``"class"`` column.
        hparams: Mapping providing ``max_depth``, ``splitter``, ``criterion``,
            ``ccp_alpha``, ``known_max_depth`` and ``known_max_min_leaf``.
        output_fn_base: Name used both as output directory and file suffix.

    Returns:
        ``(logs, metrics_dictionary)``: a list of report strings and a dict
        with the learning-curve results (``train_scores``, ``train_sizes``,
        ``validation_scores``, ``fit_times``) and the ``cross_validate``
        output under ``"cvd"``.
    """
    logs = []
    metrics_dictionary = {}
    print("----Running Decision Tree Experiment-----")
    print("Hyperparameters Used: ")
    print(hparams)
    X = dataset["features"]
    y = dataset["class"]
    # NOTE: the split is unseeded, so results differ between runs.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        shuffle=True,
                                                        test_size=.2,
                                                        train_size=.8)

    # tree_clf = DecisionTreeClassifier()

    # # # grid search (comment out in production)
    # parameter_space = {
    #     'max_depth': list(range(hparams["known_max_depth"]+2)[1:]),
    #     "ccp_alpha": [.001,.1, .2,.3,.4,.5,.6,.7,.8,.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5],
    #     'criterion': ['gini', 'entropy']
    # }
    # baseclf = DecisionTreeClassifier()
    # clf = GridSearchCV(baseclf, parameter_space, n_jobs=1, cv=3)
    # clf.fit(X_train, y_train)
    # print('Best parameters found:\n', clf.best_params_)

    tree_clf = DecisionTreeClassifier(max_depth=hparams["max_depth"],
                                      splitter=hparams["splitter"],
                                      criterion=hparams["criterion"],
                                      ccp_alpha=hparams["ccp_alpha"])

    # learning curve
    cv = ShuffleSplit(n_splits=100, test_size=0.2)
    train_scores, train_sizes, validation_scores, fit_times = plot_learning_curve2(
        tree_clf,
        "%s/DT/Curve_%s" % (output_fn_base, output_fn_base),
        "Unboosted DT Learning Curve for %s" % output_fn_base,
        X,
        y,
        ylim=None,
        cv=cv,
        n_jobs=4)
    metrics_dictionary["train_scores"] = train_scores
    metrics_dictionary["train_sizes"] = train_sizes
    metrics_dictionary["validation_scores"] = validation_scores
    metrics_dictionary["fit_times"] = fit_times

    tree_clf.fit(X=X_train, y=y_train)
    runtimes = get_runtime_avgs(tree_clf, X_test)
    # confusion matrix
    matrix = create_confusion_matrix(
        tree_clf, X, y, 10,
        "%s/DT/Confusion_%s" % (output_fn_base, output_fn_base))

    # let's show the actual tree
    export_graphviz(tree_clf,
                    out_file=("%s/DT/%s.dot" %
                              (output_fn_base, output_fn_base)),
                    feature_names=dataset["features"].columns,
                    rounded=True,
                    filled=True)
    (graph, ) = pydot.graph_from_dot_file('%s/DT/%s.dot' %
                                          (output_fn_base, output_fn_base))
    graph.write_png('%s/DT/%s.png' % (output_fn_base, output_fn_base))

    # classification report:
    logs.append(classification_report(y_test, tree_clf.predict(X_test)))

    # cross validate:
    # The accuracy scorer is registered as 'accuracy:' (with a trailing
    # colon), so its results land under "test_accuracy:". The key is left
    # unchanged because it is part of the returned metrics dictionary.
    cvd = cross_validate(
        tree_clf,
        X.values,
        y.values,
        cv=10,
        scoring={
            'accuracy:': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score, average='micro'),
            'recall': make_scorer(recall_score, average='micro'),
            'f1_score': make_scorer(f1_score, average='micro')
        },
    )
    metrics_dictionary["cvd"] = cvd

    # now run experiments for the same metric but relative to hyperparameters
    # pruning is done with min_samples leaf, max_depth, ccp_alpha
    # info on ccp_alpha: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

    ##################################
    # CCP_ALPHA

    path = tree_clf.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities
    ccp_alphas = list(ccp_alphas)
    ccp_alphas.sort()
    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.plot(ccp_alphas[:-1], impurities[:-1])
    ax.set_xlabel("effective alpha")
    ax.set_ylabel("total impurity of leaves")
    ax.set_title("Total Impurity vs effective alpha for training set")
    plt.savefig("%s/DT/IMPURITYVALPHA_%s" % (output_fn_base, output_fn_base))
    plt.close()

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Alpha Impact on Score - DT For %s" % output_fn_base)
    ax.plot(ccp_alphas, train_scores, label="In-Sample")
    ax.plot(ccp_alphas, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/CCPALPHA_%s" % (output_fn_base, output_fn_base))
    plt.close()

    ##################################
    # Max_depth

    # Depths 1 .. known_max_depth + 1.
    max_depths = list(range(hparams["known_max_depth"] + 2)[1:])
    clfs = []
    for depth in max_depths:
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with max_depth: {}".format(
        clfs[-1].tree_.node_count, max_depths[-1]))

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("max_depth")
    ax.set_ylabel("accuracy")
    ax.set_title("Max Depth Impact on Score - DT For %s" % output_fn_base)
    ax.plot(max_depths, train_scores, label="In-Sample")
    ax.plot(max_depths, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/DEPTH_%s" % (output_fn_base, output_fn_base))
    plt.close()

    ##################################
    # min_samples_leaf

    # Leaf sizes 2 .. known_max_min_leaf - 1 (the leading 1 is sliced off).
    min_samples_leaf_params = list(
        range(1, hparams["known_max_min_leaf"], 1)[1:])
    clfs = []
    for leaf_size in min_samples_leaf_params:
        clf = DecisionTreeClassifier(min_samples_leaf=leaf_size)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with min_samples_leaf: {}".
          format(clfs[-1].tree_.node_count, min_samples_leaf_params[-1]))

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("min_samples_leaf")
    ax.set_ylabel("accuracy")
    ax.set_title("Min Leaf Samples Impact on Score - DT For %s" %
                 output_fn_base)
    ax.plot(min_samples_leaf_params, train_scores, label="In-Sample")
    ax.plot(min_samples_leaf_params, test_scores, label="Out-Of-Sample")
    ax.legend()
    plt.savefig("%s/DT/LEAF_%s" % (output_fn_base, output_fn_base))
    plt.close()

    logs.append("\tHyperparameters: \n")
    logs.append("\t%s" % str(hparams))
    logs.append("\n\n\tConfusion Matrix: \n")
    logs.append("\t%s\n" % str(matrix).replace("\n", "\n\t"))
    # Report the accuracy scorer here; this line used to repeat the
    # precision result under the "Mean Accuracy" label.
    logs.append("\n\tMean Accuracy %.05f" % cvd["test_accuracy:"].mean())
    logs.append("\n\tMean Precision Score of positive examples %.05f" %
                cvd["test_precision"].mean())
    logs.append("\n\tMean Recall Score of positive examples %.05f" %
                cvd["test_recall"].mean())
    logs.append("\n\tF1 Score of positive examples %.05f\n" %
                cvd["test_f1_score"].mean())
    logs.append("\n\tMean Query Time %.05f\n" % runtimes)

    return logs, metrics_dictionary
Beispiel #20
0
def get_best_tree(data, criterion, name="", graph=False):
    """Fit one pruned tree per cost-complexity alpha and return the best one.

    The pruning path of an unpruned ``DecisionTreeClassifier`` is computed on
    the training split, one tree is fitted for every alpha on that path
    (except the last one), and the tree with the highest accuracy on the test
    split is selected. Ties go to the smallest alpha.

    :param data: mapping with the keys ``"training_data"``,
        ``"training_target"``, ``"test_data"`` and ``"test_target"``.
    :param criterion: split criterion forwarded to ``DecisionTreeClassifier``;
        it is also embedded in the plot file names.
    :param name: optional prefix for the plot file names.
    :param graph: when true, save two diagnostic plots under ``images/``
        (node count / depth vs alpha, and accuracy vs alpha).
    :return: ``(best_clf, train_score, test_score, node_count)`` for the
        selected tree.
    :raises ValueError: if no alpha is left once the last one is dropped.
    """
    x_train, y_train = data["training_data"], data["training_target"]
    x_test, y_test = data["test_data"], data["test_target"]

    base_clf = DecisionTreeClassifier(random_state=0, criterion=criterion)
    path = base_clf.cost_complexity_pruning_path(x_train, y_train)

    # The last alpha yields the tree with only one node; drop it up front so
    # that tree is never fitted.
    ccp_alphas = path.ccp_alphas[:-1]
    if len(ccp_alphas) == 0:
        raise ValueError(
            "pruning path has no non-trivial alpha; cannot select a tree")

    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0,
                                     criterion=criterion,
                                     ccp_alpha=ccp_alpha)
        clf.fit(x_train, y_train)
        clfs.append(clf)

    train_scores = [clf.score(x_train, y_train) for clf in clfs]
    test_scores = [clf.score(x_test, y_test) for clf in clfs]
    node_counts = [clf.tree_.node_count for clf in clfs]
    depth = [clf.tree_.max_depth for clf in clfs]

    if graph:
        prefix = "images/" if name == "" else "images/" + name + "_"

        # Graph num nodes and depth vs alpha
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
        ax[0].set_xlabel("alpha")
        ax[0].set_ylabel("number of nodes")
        ax[0].set_title("Number of nodes vs alpha")
        ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
        ax[1].set_xlabel("alpha")
        ax[1].set_ylabel("depth of tree")
        ax[1].set_title("Depth vs alpha")
        fig.tight_layout()
        fig.savefig(prefix + "plot0_" + criterion + ".png")
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

        # Graph accuracy vs alpha
        fig, ax = plt.subplots()
        ax.set_xlabel("alpha")
        ax.set_ylabel("accuracy")
        ax.set_title("Accuracy vs alpha for training and testing sets")
        ax.plot(ccp_alphas,
                train_scores,
                marker='o',
                label="train",
                drawstyle="steps-post")
        ax.plot(ccp_alphas,
                test_scores,
                marker='o',
                label="test",
                drawstyle="steps-post")
        ax.legend()
        fig.savefig(prefix + "plot1_" + criterion + ".png")
        plt.close(fig)

    best = test_scores.index(max(test_scores))
    return clfs[best], train_scores[best], test_scores[best], node_counts[best]
Beispiel #21
0
# in this case, true positive = 74% and true negative is 79%. looking at the
# tree itself, it seems that the tree is very complex and might have overfit
# the data. in general, decision trees are prone to overfitting because of the
# large number of parameters in the tree model

# we can prune the tree to minimize overfitting and optimize the model
# 1. Cost Complexity Pruning
# 2. Cross Validation

############### Model Optimization: Cost Complexity Pruning ##################

# we need to find the right value of the pruning parameter, alpha. we can plot
# the accuracy of the tree as a function of alpha for both the training and
# testing data to find the optimum alpha.

# Compute the cost-complexity pruning path of the current tree on the
# training data; it carries the candidate values for alpha.
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)

# Keep every alpha except the last (maximum) one, which corresponds to the
# root node.
ccp_alphas = path.ccp_alphas[:-1]

# Fit one decision tree per remaining alpha and collect them in clf_dts.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0,
                                    ccp_alpha=ccp_alpha).fit(X_train, y_train)
    clf_dts.append(clf_dt)

# Training-set accuracy of every pruned tree, in the same order as
# ccp_alphas (used to plot accuracy as a function of alpha).
train_scores = [fitted.score(X_train, y_train) for fitted in clf_dts]
Beispiel #22
0
def _write_tree_png(clf, feature_names, png_path):
    """Export *clf* through Graphviz and write the drawing to *png_path*."""
    dot_data = StringIO()
    export_graphviz(clf,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    feature_names=feature_names,
                    class_names=['BENIGN', 'DDoS'],
                    out_file=dot_data)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(png_path)
    # NOTE(review): the Image object is discarded, exactly as in the original
    # code; presumably a leftover from a notebook - confirm before removing.
    Image(graph.create_png())


def main():
    """Fetch flow data from MySQL and tune a pruned decision tree on it.

    Steps, in order:

    1. prompt for database credentials, run the flow query and dump the
       result to ``mining.csv``;
    2. reload the CSV, rename the columns and replace missing values by -1;
    3. fit a preliminary depth-5 tree, write it to ``preliminary.png`` and
       show its confusion matrix;
    4. sweep the cost-complexity alphas (single split, then 10-fold cross
       validation) and plot the accuracies;
    5. fit the final tree with the chosen alpha, write it to ``best.png``
       and show its confusion matrix.

    The database connection is always closed once the rows are fetched, even
    if the query fails.
    """
    username = input("Enter your username\n")
    password = input("Enter your password\n")
    cnx = mysql.connector.connect(username=username,
                                  password=password,
                                  host='localhost',
                                  database='internet_traffic')
    #query the data we want from the database
    queryString = "select iat_mean, fwd_packets, bwd_packets, duration, label, bytes_per_second, syn_flag_count, rst_flag_count, psh_flag_count, ack_flag_count, urg_flag_count, cwe_flag_count, ece_flag_count, active_time_mean, idle_time_mean  from (((((flow inner join flowbytes on flow.id = flowbytes.flow_id) inner join flowflags on flow.id = flowflags.flow_id) inner join flowiat on flow.id = flowiat.flow_id) inner join flowinfo on flow.id = flowinfo.flow_id) inner join flowpackets on flow.id = flowpackets.flow_id) inner join protocol on flow.protocol_id = protocol.id"
    try:
        cursor = cnx.cursor(dictionary=True)
        print("Fetching data from database...")
        cursor.execute(queryString)
        # Read everything that is needed from the cursor before releasing it.
        rows = list(cursor)
        fieldnames = list(cursor.column_names)
        cursor.close()
    finally:
        # Release the connection whether or not the query succeeded.
        cnx.close()

    with open('mining.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print("Data successfully fetched and recorded in csv file...")
    #import the data
    dataframe = pd.read_csv("mining.csv", header=0)
    #used to check if the dataframe loaded the data properly
    dataframe.columns = [
        'IATMean', 'ForwardPackets', 'BackwardPackets', 'Duration', 'Label',
        'BytesPerSecond', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
        'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
        'ActiveTimeMean', 'IdleTimeMean'
    ]

    #display the data types
    print(dataframe.head())
    print(dataframe.dtypes)

    #print unique values for each column, then replace its missing values
    for columnName in dataframe.columns:
        print(columnName + ":")
        print(dataframe[columnName].unique())
        dataframe = dataframe.fillna({columnName: -1})

    #split dataframe into independent and dependent
    X = dataframe.drop('Label', axis=1).copy()
    y = dataframe['Label'].copy()

    #build the preliminary classification tree
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = DecisionTreeClassifier(random_state=42, max_depth=5)
    clf = clf.fit(X_train, y_train)
    # plot the preliminary tree
    _write_tree_png(clf, X.columns, 'preliminary.png')

    #create the confusion matrix for the preliminary decision tree
    plot_confusion_matrix(clf,
                          X_test,
                          y_test,
                          display_labels=["BENIGN", "DDoS"])
    plt.show()

    #cost complexity pruning
    #goal is to find the best pruning parameter alpha which controls how much pruning happens
    path = clf.cost_complexity_pruning_path(X_train, y_train)
    # the last alpha is dropped from the candidates
    ccp_alphas = path.ccp_alphas[:-1]

    clfs = []  #we put decisions trees into here

    print("Cost Complexity Pruning")
    for ccp_alpha in ccp_alphas:
        print("make tree for alpha")
        clf = DecisionTreeClassifier(random_state=0,
                                     ccp_alpha=ccp_alpha,
                                     max_depth=5)
        clf = clf.fit(X_train, y_train)
        clfs.append(clf)

    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("Accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas,
            train_scores,
            marker='o',
            label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas,
            test_scores,
            marker='o',
            label="test",
            drawstyle="steps-post")
    ax.legend()
    plt.show()

    #there could have been many ways we divide the training and testing dataset
    #we use 10-fold cross validation to see if we used the best training and testing dataset
    #i.e one set of data may have a different optimal alpha

    #demonstrate using a single alpha with different data sets
    #we see that this alpha is sensitive to the datasets
    print("Cross validation")
    clf = DecisionTreeClassifier(random_state=42,
                                 ccp_alpha=0.000005,
                                 max_depth=5)
    scores = cross_val_score(clf, X_train, y_train, cv=10)
    df = pd.DataFrame(data={'tree': range(10), 'accuracy': scores})
    df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
    plt.show()

    #use cross validation to find optimal value for ccp_alpha
    alpha_loop_values = []

    print("10-fold for more than one alpha")
    #for each alpha candidate, we run a 10-fold cross validation
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0,
                                     ccp_alpha=ccp_alpha,
                                     max_depth=5)
        scores = cross_val_score(clf, X_train, y_train, cv=10)
        alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])
        print("Finished one alpha candidate")

    #graph the mean and standard deviation of the scores for each candidate alpha
    alpha_results = pd.DataFrame(alpha_loop_values,
                                 columns=['alpha', 'mean_accuracy', 'std'])

    alpha_results.plot(x='alpha',
                       y='mean_accuracy',
                       yerr='std',
                       marker='o',
                       linestyle='--')
    plt.show()

    #this part is used to find the exact optimal alpha value used to create the optimal pruned classification tree
    print("optimal alpha value")
    optimal_alpha = alpha_results[(alpha_results['alpha'] > 0)
                                  & (alpha_results['alpha'] < 0.0001)]
    print(optimal_alpha)

    #optimal pruned tree
    # NOTE(review): this alpha is hard-coded rather than taken from
    # optimal_alpha above; presumably copied from an earlier run - confirm.
    clf = DecisionTreeClassifier(random_state=42,
                                 ccp_alpha=2.247936 * (10**(-10)),
                                 max_depth=5)
    clf = clf.fit(X_train, y_train)
    _write_tree_png(clf, X.columns, 'best.png')

    #draw a confusion matrix for the optimal pruned tree
    disp = plot_confusion_matrix(clf,
                                 X_test,
                                 y_test,
                                 display_labels=["BENIGN", "DDoS"])
    print(disp)
    plt.show()
Beispiel #23
0
class DTLearner(object):
    """Decision-tree experiment helper: parameter sweeps, plots and a log.

    ``train`` fits a family of ``DecisionTreeClassifier`` models along one
    hyperparameter, and ``test`` scores such a family, plots the accuracies
    and returns the best model. Both select the hyperparameter with the same
    integer ``flag``:

    * ``0`` - ``ccp_alpha`` (values taken from the cost-complexity path)
    * ``1`` - ``max_depth``
    * ``2`` - ``min_samples_leaf``
    * ``3`` - ``min_samples_split``

    Any other flag makes ``train`` and ``test`` return ``None``.

    Progress is appended to ``dt_info.txt``; the file is opened by the
    constructor and closed by ``final_test``.
    """

    # Directory the accuracy plots are written to unless the constructor is
    # given another one.
    fig_dir = '/Users/ajinkya.bagde/Desktop/AS1_Figs/DT'

    # flag -> (estimator keyword, candidate values) for the fixed-grid sweeps.
    _SWEEPS = {
        1: ("max_depth", range(1, 25)),
        2: ("min_samples_leaf", range(1, 20)),
        3: ("min_samples_split", range(2, 20)),
    }

    def __init__(self, leaf_size=1, n_folds=10, verbose=False, fig_dir=None):
        """Set up the learner and open the log file.

        :param leaf_size: stored on the instance as ``leaf_size``.
        :param n_folds: number of cross-validation folds used by
            ``tune_hyperparameters``.
        :param verbose: when true, ``train`` and ``test`` announce themselves.
        :param fig_dir: directory for the plots saved by ``test``; ``None``
            keeps the class default.
        """
        self.leaf_size = leaf_size
        self.n_folds = n_folds
        self.cv_scores = []
        self.clf = DecisionTreeClassifier()
        self.predictions = []
        self.accuracy_score = 0.0
        self.verbose = verbose
        if fig_dir is not None:
            self.fig_dir = fig_dir

        # Search space handed to GridSearchCV by tune_hyperparameters.
        self.param_dict = {
            "criterion": ['gini', 'entropy'],
            "ccp_alpha": [0, 0.0002, 0.0004, 0.0006, 0.0008, 0.001],
            "max_depth": range(1, 25),
            "min_samples_split": range(2, 5),
            "min_samples_leaf": range(1, 5)
        }
        self.grid = 0

        # Write data to file for easy analysis
        self.f = open("dt_info.txt", "a")
        self.f.write("\n")
        self.f.write(str(datetime.now()))

    @staticmethod
    def _fit_sweep(param_name, values, X_train, y_train):
        """Fit one ``random_state=0`` tree per value of ``param_name``."""
        clfs = []
        for value in values:
            clf = DecisionTreeClassifier(random_state=0,
                                         **{param_name: value})
            clf.fit(X_train, y_train)
            clfs.append(clf)
        return clfs

    def train(self, X_train, y_train, flag):
        '''
        Fit a family of trees along the hyperparameter selected by ``flag``.

        :param X_train: training data
        :param y_train: training labels
        :param flag: which hyperparameter to sweep (see the class docstring)
        :return: ``(clfs, values)`` - the fitted trees and the swept values,
            in matching order - or ``None`` for an unknown flag
        '''

        if self.verbose:
            print("Training Decision Tree Model...")
            self.f.write("Training Decision Tree Model...")

        if flag == 0:
            # Use cost_complexity_pruning_path to get the effective alphas
            # (the values of ccp_alpha that could be appropriate).
            # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
            path = self.clf.cost_complexity_pruning_path(X_train, y_train)
            # Keep every fifth alpha, then drop the last of those; dropping
            # it before fitting avoids training a tree that is thrown away.
            ccp_alphas = path.ccp_alphas[0::5][:-1]
            clfs = self._fit_sweep("ccp_alpha", ccp_alphas, X_train, y_train)
            return clfs, ccp_alphas

        sweep = self._SWEEPS.get(flag)
        if sweep is None:
            return None
        param_name, values = sweep
        return self._fit_sweep(param_name, values, X_train, y_train), values

    def test(self, X_test, X_train, y_test, y_train, clfs, alphas, depths,
             min_samples_leafs, min_samples_splits, flag):
        '''
        Score ``clfs`` on both splits, plot the accuracies, pick the best.

        :param X_test: test data
        :param X_train: training data
        :param y_test: test labels
        :param y_train: training labels
        :param clfs: fitted trees, e.g. as returned by ``train``
        :param alphas: swept values matching ``clfs`` when ``flag`` is 0
        :param depths: swept values matching ``clfs`` when ``flag`` is 1
        :param min_samples_leafs: swept values when ``flag`` is 2
        :param min_samples_splits: swept values when ``flag`` is 3
        :param flag: which hyperparameter was swept (see the class docstring)
        :return: the tree with the highest test accuracy (first one on ties),
            or ``None`` for an unknown flag
        '''

        if self.verbose:
            print("Testing Decision Tree Model...")

        # flag -> (display label, swept values, figure file name)
        reports = {
            0: ("Alpha", alphas, "alpha_vs_accuracy.png"),
            1: ("Depth", depths, "depth_vs_accuracy.png"),
            2: ("min_sample_leaf", min_samples_leafs,
                "minsampleleaf_vs_accuracy.png"),
            3: ("min_sample_split", min_samples_splits,
                "minsamplesplit_vs_accuracy.png"),
        }
        report = reports.get(flag)
        if report is None:
            return None
        label, values, fig_name = report

        self.accuracy_score_train = []
        self.accuracy_score_test = []
        for clf in clfs:
            self.accuracy_score_train.append(
                accuracy_score(y_train, clf.predict(X_train)))
            self.accuracy_score_test.append(
                accuracy_score(y_test, clf.predict(X_test)))

        best_score = max(self.accuracy_score_test)
        best_index = self.accuracy_score_test.index(best_score)
        best_value = values[best_index]

        # Report the best accuracy/value combination on stdout and in the log.
        score_msg = "Best Accuracy Score (Test Validation Set): "
        value_msg = "Best {} (Highest Accuracy, Test Validation Set): ".format(
            label)
        print(score_msg, best_score)
        print(value_msg, best_value)
        self.f.write(score_msg + str(best_score) + "\n")
        self.f.write(value_msg + str(best_value) + "\n")

        fig = plt.figure()
        try:
            plt.plot(values,
                     self.accuracy_score_train,
                     label='Accuracy Score (Training Validation Set)')
            plt.plot(values,
                     self.accuracy_score_test,
                     label='Accuracy Score (Test Validation Set)')
            plt.xlabel(label)
            plt.ylabel('Accuracy')
            plt.title('Accuracy vs {} Value'.format(label))
            plt.legend()
            plt.savefig(self.fig_dir + '/' + fig_name)
        finally:
            # pyplot keeps figures alive until they are closed explicitly.
            plt.close(fig)

        return clfs[best_index]

    def tune_hyperparameters(self, final_dt, xtrain, ytrain):
        """Grid-search ``param_dict`` for ``final_dt`` and log the winner.

        :param final_dt: estimator handed to ``GridSearchCV``
        :param xtrain: training data
        :param ytrain: training labels
        :return: ``best_params_`` of the fitted grid search
        """
        self.grid = GridSearchCV(final_dt,
                                 param_grid=self.param_dict,
                                 cv=self.n_folds,
                                 verbose=1,
                                 n_jobs=-1)
        self.grid.fit(xtrain, ytrain)

        self.f.write("Best Params from GridSearchCV: " +
                     str(self.grid.best_params_))
        return self.grid.best_params_

    def final_test(self, clf, xtest, ytest):
        """Print and log the accuracy of ``clf``, then close the log file.

        :param clf: fitted classifier to evaluate
        :param xtest: test data
        :param ytest: test labels
        """
        final_accuracy = accuracy_score(ytest, clf.predict(xtest))
        print(final_accuracy)
        self.f.write("Final Accuracy Score (Test Set): " + str(final_accuracy))
        self.f.close()
# Draw the fitted tree, labelling nodes with the feature and class names.
plot_tree(clsf_des_tree,
          feature_names=X_encoded.columns,
          class_names=["No HD", "Yes HD"],
          filled=True,
          rounded=True)

# Confusion matrix of the same tree on the test split.
plot_confusion_matrix(clsf_des_tree,
                      X_test,
                      y_test,
                      display_labels=["No HD", "Yes HD"])

# Prune the tree to address overfitting: take the candidate alphas from the
# cost-complexity pruning path, leaving out the last (maximum) one.
path = clsf_des_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]

# Fit one decision tree per candidate alpha and keep them all in clf_dts.
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(random_state=0,
                                    ccp_alpha=ccp_alpha).fit(X_train, y_train)
    clf_dts.append(clf_dt)

# Accuracy of every pruned tree on both splits, in the order of ccp_alphas.
train_scores = [fitted.score(X_train, y_train) for fitted in clf_dts]
test_scores = [fitted.score(X_test, y_test) for fitted in clf_dts]
Beispiel #25
0
##############################################################################
############### further steps if using Decision Tree Classifier  #############

from sklearn.tree import plot_tree

# Draw the tree fitted on the training set.
plt.figure(figsize=(15, 7.5))
plot_tree(dtc,
          filled=True,
          rounded=True,
          class_names=['No heart failure', 'With heart failure'],
          feature_names=X_train.columns)

##############################################################################
# Cost Complexity Pruning - CCP - to avoid overfitting

# NOTE(review): the result of this first call is discarded; the same path is
# computed again right below - presumably a notebook leftover.
dtc.cost_complexity_pruning_path(X_train, y_train)

# Pruning path of the tree; only its alphas are used here, and the last
# (maximum) alpha is left out.
pruned = dtc.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = pruned.ccp_alphas[:-1]
ccp_alphas
'''
create one decision tree for each value of alpha and store it in the array dtcs
'''
dtcs = []
for ccp_alpha in ccp_alphas:
    dtc = DecisionTreeClassifier(random_state=0,
                                 ccp_alpha=ccp_alpha).fit(X_train, y_train)
    dtcs.append(dtc)
pruned.ccp_alphas
Beispiel #26
0
def Alphatest(
        train_csv="/Users/evanedelstein/Desktop/Research_Evan/Raji_Summer2019_atom/Data_Files/Logistic_regresion_corrected/noxdata.csv",
        benchmark_csv='/Users/evanedelstein/Desktop/Research_Evan/Raji_Summer2019_atom/Data_Files/Logistic_regresion_corrected/benchmarkdata.csv'):
    """Pick a cost-complexity alpha on one data set and score a benchmark.

    The training CSV is split 70/30, one tree is fitted per alpha on the
    pruning path, and the tree with the best test accuracy is reported,
    rendered to ``DTtesttree`` and evaluated on the benchmark CSV. A second
    tree with a fixed alpha is fitted on the whole training CSV and evaluated
    on the benchmark as well.

    Both CSV files are read without a header row, with the columns
    ``residue, predus, ispred, dockpred, annotated``; missing values are
    forward-filled.

    :param train_csv: path of the CSV used for fitting and alpha selection.
    :param benchmark_csv: path of the CSV used for the final evaluation.
    """
    col_names = ['residue', 'predus', 'ispred', 'dockpred', 'annotated']
    # define dependent var columns
    feature_cols = ['predus', 'ispred', 'dockpred']

    # load dataset and forward-fill missing values
    data = pd.read_csv(train_csv, header=None, names=col_names).ffill()
    X = data[feature_cols]  # Features
    y = data.annotated  # Target variable
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1)

    model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
    print('Training accuracy: ', model.score(X_train, y_train))
    print('Test Accuracy: ', model.score(X_test, y_test))

    path = model.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = path.ccp_alphas

    # One tree per alpha. A separate loop name keeps ``model`` bound to the
    # unpruned tree instead of being overwritten by the last pruned one.
    clfs = []
    for ccp_alpha in ccp_alphas:
        pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        pruned.fit(X_train, y_train)
        clfs.append(pruned)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))

    # The last tree is excluded from the selection.
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas,
            train_scores,
            marker='o',
            label="train",
            drawstyle="steps-post")
    ax.plot(ccp_alphas,
            test_scores,
            marker='o',
            label="test",
            drawstyle="steps-post")
    ax.legend()
    # plt.show()

    index_best_model = np.argmax(test_scores)
    best_model = clfs[index_best_model]
    print('Training accuracy of best model: ',
          best_model.score(X_train, y_train))
    print('Test accuracy of best model: ', best_model.score(X_test, y_test))
    print(best_model.get_params())

    # Render the tree that was just reported. The original exported the last
    # tree fitted in the alpha loop, i.e. the most heavily pruned one.
    dot_data = tree.export_graphviz(best_model, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("DTtesttree")

    benchmark = pd.read_csv(benchmark_csv, header=None,
                            names=col_names).ffill()
    X_bench = benchmark[feature_cols]
    y_bench = benchmark.annotated
    y_predict_best = best_model.predict(X_bench)
    print(accuracy_score(y_bench, y_predict_best))

    # NOTE(review): fixed alpha, presumably taken from an earlier run - confirm.
    model2 = tree.DecisionTreeClassifier(ccp_alpha=0.0010540940654176906)
    model2.fit(X, y)
    y_predict = model2.predict(X_bench)
    # Accuracy
    print(accuracy_score(y_bench, y_predict))
Beispiel #27
0
# Table of feature names and their importance scores from m1
# (a higher score means a more significant feature).
impf = pd.DataFrame({
    'features': trainx.columns,
    'score': m1.feature_importances_,
})

# Order the table in place from the highest score to the lowest.
impf.sort_values(by='score', ascending=False, inplace=True)

# Bar chart of the features by score.
sns.barplot(x=impf['score'], y=impf['features'])
plt.title('Decision Tree - Significant Features')
plt.xlabel('Score')
plt.ylabel('Features')

# Decision tree pruning: the cost-complexity path supplies the candidate
# ccp_alpha values.
dt_path = m1.cost_complexity_pruning_path(trainx, trainy)
ccp_alphas = dt_path.ccp_alphas

# Fit one tree per candidate alpha so the best value can be picked.
results = []
for cp in ccp_alphas:
    m = DecisionTreeClassifier(ccp_alpha=cp)
    m = m.fit(trainx, trainy)
    results.append(m)

# Accuracy of every fitted tree on the train and the test data.
trg_score = [fitted.score(trainx, trainy) for fitted in results]
test_score = [fitted.score(testx, testy) for fitted in results]

# plot the scores
Beispiel #28
0
class DecisionTreeModel(BaseLearner):
    """Decision tree learner backed by ``DecisionTreeClassifier(random_state=1)``.

    Besides fitting and predicting, it can plot the effect of cost-complexity
    pruning (``post_prune``) and export a drawing of the fitted tree
    (``export_graph``).
    """

    def __init__(self, X_train, X_test, y_train, y_test, pipe,
                 pre_processed_feature_names, class_names, dataset_name):
        """Build the classifier, register it with ``BaseLearner`` and fit it.

        Every argument is passed through to ``BaseLearner.__init__``, which is
        presumably where ``self.X_train`` / ``self.y_train`` are stored
        (TODO confirm against BaseLearner).
        """
        self.model = DecisionTreeClassifier(random_state=1)

        super().__init__(X_train, X_test, y_train, y_test, pipe, self.model,
                         pre_processed_feature_names, class_names,
                         dataset_name)

        self.model.fit(self.X_train, self.y_train)
        # Baseline hyper-parameters; update_and_refit_model() rebuilds the
        # classifier from this dict.
        self.model_params = {'random_state': 1}

    def fit(self):
        """Fit the current model on the training features and labels."""
        # ``model`` is an instance attribute, which a ``super()`` proxy does
        # not expose, and the fit target is the training labels.
        self.model.fit(self.X_train, self.y_train)

    def predict(self, y):
        """Return the model's predictions for the samples in ``y``."""
        return self.model.predict(y)

    def _figure_path(self, filename):
        """Return the path of ``filename`` inside this model's output folder."""
        directory = os.path.join(OUTPUT, self.dataset_name, "DecisionTree")
        # savefig does not create missing folders.
        os.makedirs(directory, exist_ok=True)
        return os.path.join(directory, filename)

    @staticmethod
    def _format_axis(axis):
        """Apply the shared tick size, 3-decimal y tick labels and grid."""
        axis.tick_params(axis="x", labelsize=6)
        axis.tick_params(axis="y", labelsize=6)
        axis.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
        axis.grid()

    def post_prune(self):
        """Plot the effect of cost-complexity pruning and save the figures.

        Three PNG files are written under ``OUTPUT/<dataset_name>/DecisionTree``:
        total leaf impurity vs. effective alpha, node count and tree depth
        vs. alpha, and mean train / cross-validated F1 vs. alpha.
        ``self.model_params`` is left untouched.

        Code adapted from: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
        """
        pruning_path = self.model.cost_complexity_pruning_path(
            self.X_train, self.y_train)
        impurities = pruning_path.impurities
        # The final path entry is excluded everywhere below, as in the
        # scikit-learn example this is adapted from (there it is the trivial
        # single-node tree).
        ccp_alphas = pruning_path.ccp_alphas[:-1]

        title_dic = {'fontsize': 6, 'fontweight': 'bold'}

        # Figure 1: total impurity of the leaves against effective alpha.
        fig, ax = plt.subplots()
        ax.plot(ccp_alphas,
                impurities[:-1],
                marker='o',
                drawstyle="steps-post")
        ax.set_xlabel("effective alpha", title_dic)
        ax.set_ylabel("total impurity of leaves", title_dic)
        ax.set_title("Total Impurity vs effective alpha for training set",
                     title_dic)
        self._format_axis(ax)
        plt.tight_layout()
        plt.savefig(
            self._figure_path("Impurities vs Alpha" + "_" + "DecisionTree" +
                              "_" + self.dataset_name + ".png"))

        # One tree per alpha.  A fresh dict is built for every tree so the
        # sweep never leaks a ccp_alpha into self.model_params.
        clfs = []
        for ccp_alpha in ccp_alphas:
            params = dict(self.model_params, ccp_alpha=ccp_alpha)
            clf = DecisionTreeClassifier(**params)
            clf.fit(self.X_train, self.y_train)
            clfs.append(clf)

        # Figure 2: tree size (nodes and depth) against alpha.
        node_counts = [clf.tree_.node_count for clf in clfs]
        depth = [clf.tree_.max_depth for clf in clfs]
        fig, ax = plt.subplots(2, 1)
        ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
        ax[0].set_xlabel("alpha", title_dic)
        ax[0].set_ylabel("number of nodes", title_dic)
        ax[0].set_title("Number of nodes vs alpha", title_dic)
        ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
        ax[1].set_xlabel("alpha", title_dic)
        ax[1].set_ylabel("depth of tree", title_dic)
        ax[1].set_title("Depth vs alpha", title_dic)
        self._format_axis(ax[0])
        self._format_axis(ax[1])
        fig.tight_layout()
        plt.savefig(
            self._figure_path("Nodes and Depth vs Alpha" + "_" +
                              "DecisionTree" + "_" + self.dataset_name +
                              ".png"))

        # Figure 3: mean train and cross-validated F1 for every alpha.
        train_scores = []
        valid_scores = []
        for clf in clfs:
            cv = cross_validate(clf,
                                self.X_train,
                                self.y_train,
                                scoring='f1',
                                return_train_score=True)
            train_scores.append(np.mean(cv['train_score']))
            valid_scores.append(np.mean(cv['test_score']))

        title = "MC Curve for MCC alpha" + "\n" + self.dataset_name
        fig, ax1 = plt.subplots(1, 1, figsize=(3, 2))
        ax1.set_title(title, title_dic)
        ax1.set_ylabel("Mean F1 Score", title_dic)
        ax1.set_xlabel("alpha", title_dic)

        ax1.plot(ccp_alphas, train_scores, 'r', linewidth=2, label="train")
        ax1.plot(ccp_alphas, valid_scores, 'b', linewidth=2, label="cross val")

        ax1.legend(loc='best', fontsize=6)
        self._format_axis(ax1)
        plt.tight_layout()
        plt.savefig(self._figure_path("MC_Curve_" + "alpha" + ".png"))

    def update_and_refit_model(self):
        """Rebuild the classifier from ``self.model_params`` and refit it."""
        self.model = DecisionTreeClassifier(**self.model_params)
        self.model.fit(self.X_train, self.y_train)

    def export_graph(self, filename):
        """Refit the model, draw the tree and save it as ``<filename>.eps``."""
        self.model.fit(self.X_train, self.y_train)
        plt.figure()

        plot_tree(self.model,
                  feature_names=self.pre_processed_feature_names,
                  class_names=self.class_names,
                  rounded=True,
                  filled=True,
                  fontsize=4)
        plt.savefig(filename + ".eps", format='eps', bbox_inches='tight')
Beispiel #29
0
def dt(d, id=None):
    """Fit, tune and evaluate a decision tree on ``d``, saving diagnostic plots.

    The data is split once (stratified, 33 % held out).  A default tree is
    scored first; a grid search over ``criterion``, ``max_depth`` and
    ``ccp_alpha`` then picks the best tree, for which learning-curve,
    validation-curve, ROC and confusion-matrix figures are written under
    ``figures/``.  Metrics are printed; nothing is returned.

    Args:
        d: DataFrame holding the features plus the target in column ``'y'``.
        id: Tag inserted into the output file names.  ``'E'`` makes the grid
            search optimise F1; any other value uses ROC AUC.  (The name
            shadows the builtin but is kept for existing callers.)
    """
    split = StratifiedShuffleSplit(n_splits=1,
                                   test_size=.33,
                                   random_state=SEED)
    # split() yields positional row indices, so the rows are taken with
    # iloc; loc would treat them as index labels and pick the wrong rows
    # (or raise) whenever ``d`` does not have a default 0..n-1 index.
    train_idx, test_idx = next(split.split(d, d['y']))
    train_set, test_set = d.iloc[train_idx], d.iloc[test_idx]
    y_train, y_test = train_set['y'], test_set['y']
    X_train, X_test = train_set.drop('y', axis=1), test_set.drop('y', axis=1)

    scoring = 'f1' if id == 'E' else 'roc_auc'

    # Baseline: an untuned tree scored on the held-out split.
    model_naive = DecisionTreeClassifier()
    model_naive.fit(X_train, y_train)
    pred = model_naive.predict(X_test)
    cv = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
    print('Accuracy of default model is: ', accuracy_score(y_test, pred))
    print('F1 of default model is: ', f1_score(y_test, pred))
    # NOTE: this AUC is computed from hard class predictions, not scores.
    print('roc_auc of default is: ', roc_auc_score(y_test, pred))
    plot_lc(model_naive,
            'Learning Curve - Default Model',
            X_train,
            y_train,
            cv=cv,
            n_jobs=-1,
            path='figures/DT_{}_Default_learning.png'.format(id))
    plt.clf()
    print('plot saved')

    # Hyper-parameter search with the same stratified folds.
    model = DecisionTreeClassifier(random_state=SEED)
    grid = dict(criterion=['gini', 'entropy'],
                max_depth=np.arange(1, 50),
                ccp_alpha=[.005, .003, .002, .001])
    out = GridSearchCV(estimator=model,
                       param_grid=grid,
                       n_jobs=-1,
                       cv=cv,
                       scoring=scoring,
                       error_score=0)
    result = out.fit(X_train, y_train)
    best_model = result.best_estimator_
    best_params = result.best_params_

    print("Best {} {} using params {}".format(scoring, result.best_score_,
                                              best_params))

    # Candidate alphas from the pruning path of the untuned seeded tree,
    # without the final path entry.
    pruning_path = model.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = pruning_path.ccp_alphas[:-1]

    # learning_curve
    plot_lc(best_model,
            'Learning Curve - Best Model',
            X_train,
            y_train,
            cv=cv,
            n_jobs=-1,
            path='figures/DT_{}_Best_learning.png'.format(id))
    plt.clf()

    # validation curve over 20 evenly spaced alphas spanning the path
    if len(ccp_alphas) > 0:
        param_range = np.linspace(ccp_alphas[0], ccp_alphas[-1], num=20)
        plot_vc(best_model,
                X_train,
                y_train,
                'Validation Curve',
                'ccp_alpha',
                'ccp_alphas',
                param_range,
                scoring,
                cv=cv,
                path='figures/DT_{}_Best_valid.png'.format(id))
        plt.clf()
    else:
        # A path with a single entry leaves no alpha range to sweep.
        print('No pruning alphas available; validation curve skipped')

    plot_roc_curve(best_model, X_test, y_test)
    plt.savefig('figures/DT_{}_ROC.png'.format(id))
    plt.clf()

    # Final evaluation of the tuned tree on the held-out split.
    best_model.fit(X_train, y_train)
    pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    c_matrix = confusion_matrix(y_test, pred)
    c_report = classification_report(y_test, pred)

    print('Best Accuracy: ', accuracy)
    print('confusion_matrix: \n', c_matrix)
    print('classification_report: \n', c_report)

    plot_confusion_matrix(best_model, X_test, y_test, normalize='pred')
    plt.savefig("figures/DT_{}_Confusion.png".format(id))
    plt.clf()
Beispiel #30
0
                                  out_file=None,
                                  feature_names=col_name,
                                  class_names=['infected', 'noninfected'],
                                  filled=True,
                                  rounded=True,
                                  special_characters=True)
# Render the DOT description as a PNG file and open the result.
graph = graphviz.Source(graph_data)
graph.format = 'png'
graph.render(filename="COVID_19_DTpruned_py", view=True)

# %%
# Print each feature name next to the importance the pruned tree gave it.
print(*zip(col_name, Single_tree_pruned.feature_importances_))

# %%
# Post pruning: cost-complexity pruning path of the tree on the training data.
prune_path = Single_tree.cost_complexity_pruning_path(
    Trained_transformed, target_train)
ccp_alphas = prune_path.ccp_alphas
impurities = prune_path.impurities

# Step plot of total leaf impurity against effective alpha; the final path
# entry is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1],
        impurities[:-1],
        marker='o',
        drawstyle="steps-post")
ax.set(xlabel="effective alpha",
       ylabel="total impurity of leaves",
       title="Total Impurity vs effective alpha for training set")

plt.show()

clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy',
                                 random_state=1,
                                 ccp_alpha=ccp_alpha)