Example #1
help(DecisionTreeClassifier)
'''
criterion : {"gini", "entropy"}, default="gini"
max_depth : int, default=None  # tree depth => the deeper the tree, the better the training accuracy (but it can overfit)
min_samples_split : int or float, default=2  # pruning control
'''
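# A minimal side-sketch of the depth/overfitting trade-off described above
# (assumption: x_test/y_test exist alongside the x_train/y_train used below):
for depth in (2, 3, 5, None):
    m = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(x_train, y_train)
    print(depth, m.score(x_train, y_train), m.score(x_test, y_test))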

dtc = DecisionTreeClassifier(criterion='gini', random_state=123, max_depth=3)
model = dtc.fit(x_train, y_train)

# Visualize the tree model
tree.plot_tree(model)

print(export_text(model))
'''
|--- feature_2 <= 2.45  : split condition on column 3 (left node)
|   |--- class: 0     -> 'setosa' classified 100% correctly
|--- feature_2 >  2.45  : split condition on column 3 (right node)
|   |--- feature_2 <= 4.75
|   |   |--- class: 1
|   |--- feature_2 >  4.75
|   |   |--- feature_3 <= 1.75
|   |   |   |--- class: 1
|   |   |--- feature_3 >  1.75
|   |   |   |--- class: 2
'''

names = iris.feature_names
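# Follow-up sketch: export_text also accepts feature_names, which turns the
# feature_2/feature_3 labels above into readable column names:
print(export_text(model, feature_names=list(names)))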
Example #2
starting_index = 14

xAttributes = [
    'Developers', 'Commit #', 'Closed Issues', 'Releases', 'Tags',
    'Open Issues', 'Duration', 'Stars', 'Forks', 'Watchers'
]

# Start every attribute's importance tally at zero
counts = {attr: 0 for attr in xAttributes}

# sample_count and getDecisionTree() are defined earlier in the original script
for index in range(starting_index, starting_index + sample_count):
    combined_df, myTree = getDecisionTree(index)
    #tree.plot_tree(myTree)
    #plt.show()
    r = export_text(myTree, feature_names=xAttributes, show_weights=True)
    print('index = ' + str(index))
    print(r)
    #print('Importances:')
    for i, feature_name in enumerate(xAttributes):
        imp = myTree.feature_importances_[i]
        #print(feature_name + " = " + str(imp))
        if imp != 0:
            counts[feature_name] = counts.get(feature_name, 0) + 1

print("Attribute wise counts for the sample decision trees")
arr = [(k, counts[k]) for k in sorted(counts, key=counts.get, reverse=True)]
for pr in arr:
    print(pr[0] + ' = ' + str(pr[1]))
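# The ranking above can also be written directly over the dict items (a sketch):
# for name, cnt in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
#     print(name + ' = ' + str(cnt))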
def determine_insect_size(arealist, thresh):
    from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier
    from sklearn.model_selection import train_test_split  # Import train_test_split function
    from sklearn import metrics  #Import scikit-learn metrics module for accuracy calculation
    from sklearn.tree import export_text
    import pandas as pd

    # NOTE: numofrects is reassigned from len(rects) further down, so this list is unused
    numofrects = []
    rectcount = 0
    for i in range(len(arealist)):
        numofrects.append(rectcount)
        rectcount += 1

    ### Load csv ###
    col_names = [
        'No. of Rects', 'Max Area', 'Area Range', 'Standard Deviation',
        'CLASSIFICATION'
    ]
    insectcsv = pd.read_csv("EdgeDetection_InsectSize_DT_Train.csv",
                            header=None,
                            names=col_names)
    feature_cols = [
        'No. of Rects', 'Max Area', 'Area Range', 'Standard Deviation'
    ]
    X = insectcsv[feature_cols]
    y = insectcsv.CLASSIFICATION
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1,
        random_state=1)  # split into 90% train and 10% test
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

    # Train Decision Tree Classifer
    clf = clf.fit(X_train, y_train)
    imagearea = thresh.shape[0] * thresh.shape[1]

    arealist = sorted(arealist)

    # Index cutoffs for percentile slices of the sorted area list
    maxarealistrange = int(len(arealist) * 0.9)  # top 10% starts here
    sixtyarealistrange = int(len(arealist) * 0.6)
    fourtyarealistrange = int(len(arealist) * 0.3)
    minarealistrange = int(len(arealist) * 0.1)  # bottom 10% ends here
    # rects and filename are defined in the enclosing scope of the original script
    overlaplist, overlappedrects = get_overlap(rects)

    numofrects = len(rects)
    numofoverlaps = len(overlappedrects)
    differenceinoverlap = numofrects - numofoverlaps

    maxarea = int(statistics.mean(arealist[maxarealistrange:]))
    midarea = int(
        statistics.mean(arealist[fourtyarealistrange:sixtyarealistrange]))
    minarea = int(statistics.mean(arealist[:minarealistrange]))
    arearange = maxarea - minarea
    stdarea = int(statistics.stdev(arealist))

    testing_data = []
    testing_data.append((numofrects, maxarea, arearange, stdarea))

    #Predict the response for the single assembled test sample
    y_pred = clf.predict(testing_data)
    if y_pred[0] == 1:
        print("DT Prediction: Big Insects")
    else:
        print("DT Prediction: Small Insects")

    r = export_text(clf, feature_names=feature_cols)
    print(r)

    print(testing_data)

    # Manual rule-based prediction: a majority vote over four hardcoded thresholds
    prediction = []

    if numofrects <= 350:
        prediction.append(1)
    else:
        prediction.append(0)

    if maxarea >= 56000:
        prediction.append(1)
    else:
        prediction.append(0)

    if arearange >= 51000:
        prediction.append(1)
    else:
        prediction.append(0)

    if stdarea >= 20000:
        prediction.append(1)
    else:
        prediction.append(0)

    print(prediction)
    prediction_final = sum(prediction)

    if prediction_final >= 3:  # majority: at least 3 of the 4 rules vote "big"
        print("Processing Big Insects")
        removedoverlappedrects_big = remove_overlapping_rectangles(
            overlaplist, rects)
        bigrects_analysed = analyse_areas_biginsects(
            removedoverlappedrects_big)
        rects_bi, numofrects_bi = draw_rectangles_biginsects(bigrects_analysed)
        save_coordinates_to_xml(filename, numofrects_bi, rects_bi)
    else:
        print("Processing Small Insects")
        bigrectsremoved = analyse_areas_smallinsects(rects)
        mergedrects = merge_rectangles(bigrectsremoved)
        overlaplist_small, overlappedrects_small = get_overlap(mergedrects)
        removedoverlappedrects_small = remove_overlapping_rectangles(
            overlaplist_small, mergedrects)
        rects_si, numofrects_si = draw_rectangles_smallinsects(
            removedoverlappedrects_small)
        save_coordinates_to_xml(filename, numofrects_si, rects_si)
print(y)
display(df)

from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import export_text

#enc = OneHotEncoder(handle_unknown='ignore')
#display(df)

clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(df, y)
tree.plot_tree(clf)  # clf is already fitted above

print(export_text(clf))

#dotfile = open("dt.dot", 'w')
#tree.export_graphviz(clf, out_file=dotfile, feature_names=df.columns)
#dotfile.close()

#################################################

import numpy as np
import pandas as pd
import io
from IPython.display import clear_output
from matplotlib import pyplot as plt

import requests
url = 'https://raw.githubusercontent.com/SergeGuillemart/Hackathon-BigData-2019/master/ressources/Traite/Finished/Total3.csv'
### Decision Tree using Scikit

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

df = pd.read_csv('Iris.csv')
df = df.drop("Id", axis=1)
df = df.rename(columns={"species": "label"})

# Train test split (the fraction passed in is the share kept for training)
def train_test_split(df, train_size=0.8, random_state=None):
    train_df = df.sample(frac=train_size, random_state=random_state)
    test_df = df[~df.index.isin(train_df.index)]
    return train_df.sort_index(), test_df.sort_index()

train_df, test_df = train_test_split(df, 0.8, 100)

decision_tree = DecisionTreeClassifier(random_state=0, max_depth=3)

decision_tree = decision_tree.fit(train_df.iloc[:, :-1], train_df.iloc[:, -1])

formated_tree = export_text(decision_tree, feature_names=df.iloc[:, :-1].columns.tolist())

print(formated_tree)

#### Evaluate
print(decision_tree.score(test_df.iloc[:, :-1], test_df.iloc[:, -1]) * 100)

def generar_arbol(codigo, profundidad=5, estadisticas=True):
    """Generates a decision tree (classifier).

    Returns the generated tree and the accuracy obtained when validating the
    model on the held-out test split.

    Parameters:
    codigo -- code of the air-quality monitoring station
    profundidad -- maximum depth of the tree. Default 5
    estadisticas -- True to export a statistics file, False otherwise. Default True
    """

    path = "../data/entrenamiento/" + str(codigo) + "/"

    f = open(path + "data_tree_" + str(codigo) + ".txt")
    cab = str(f.readline())[:-1]
    cab = cab.replace("'", "")
    cab = cab.replace(" ", "")
    cabeceras = cab[1:-1].split(",")
    l_etiquetas = str(f.readline())[:-1].split(",")
    atributos = str(f.readline())[:-1].split(",,")
    f.close()

    l_atributos = []
    for atributo in atributos:
        l_atributos.append(atributo.split(","))

    l_etiquetas = np.array(l_etiquetas)
    l_atributos = np.array(l_atributos)

    etiquetas = set(l_etiquetas)
    clases = sorted(list(etiquetas))

    # Split the data into training and validation sets:
    # validation data (0.25) / training data (0.75)
    X_train, X_test, y_train, y_test = train_test_split(l_atributos,
                                                        l_etiquetas,
                                                        test_size=0.25,
                                                        random_state=0)

    # Normalize the data
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Build the tree with the requested maximum depth
    arbol = tree.DecisionTreeClassifier(max_depth=profundidad,
                                        criterion='entropy',
                                        random_state=0)

    # Train the tree
    arbol.fit(X_train, y_train)

    # Export the tree as text
    r = export_text(arbol, feature_names=cabeceras[:-1])
    f = open(path + "export_tree_text_" + str(codigo) + ".txt", "w")
    f.write(r)
    f.close()

    # Export the tree to a .dot file
    export_graphviz(arbol,
                    out_file=path + 'export_tree_' + str(codigo) + '.dot',
                    class_names=clases,
                    feature_names=cabeceras[:-1],
                    impurity=False,
                    filled=True)

    # Predict the results for the test block
    y_pred = arbol.predict(X_test)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Reports
    report = classification_report(y_test, y_pred)

    if estadisticas:
        f = open(path + "export_tree_statistics_" + str(codigo) + ".txt", "w")
        f.write("************************ CLASS *************************\n")
        for i in range(len(clases)):
            f.write(str(clases[i]) + "\n")
        f.write("\n\n")
        f.write("********************* MAX DEPTH ************************\n")
        f.write(str(profundidad) + "\n")
        f.write("\n\n")
        f.write("***************** FEATURE IMPORTANCES ******************\n")
        feature_imp = arbol.feature_importances_
        for i in range(len(cabeceras[:-1])):
            f.write(str(cabeceras[i]) + ": " + str(feature_imp[i]) + "\n")
        f.write("\n\n")
        f.write("************************ SCORE *************************\n")
        f.write("With Test data: " + str(arbol.score(X_test, y_test)) + "\n")
        f.write("With Training data: " + str(arbol.score(X_train, y_train)) +
                "\n")
        f.write("\n\n")
        f.write("****************** CONFUSION MATRIX ********************\n")
        if len(clases) == 2:
            f.write("\t0\t1\n")
            f.write("---------------------\n")
            f.write("0 |\t" + str(cm[0][0]) + "\t" + str(cm[0][1]) + "\n")
            f.write("1 |\t" + str(cm[1][0]) + "\t" + str(cm[1][1]) + "\n")
        if len(clases) == 3:
            f.write("\t0\t1\t2\n")
            f.write("--------------------------------\n")
            f.write("0 |\t" + str(cm[0][0]) + "\t" + str(cm[0][1]) + "\t" +
                    str(cm[0][2]) + "\n")
            f.write("1 |\t" + str(cm[1][0]) + "\t" + str(cm[1][1]) + "\t" +
                    str(cm[1][2]) + "\n")
            f.write("2 |\t" + str(cm[2][0]) + "\t" + str(cm[2][1]) + "\t" +
                    str(cm[2][2]) + "\n")
        if len(clases) == 4:
            f.write("\t0\t1\t2\t3\n")
            f.write("----------------------------------------------\n")
            f.write("0 |\t" + str(cm[0][0]) + "\t" + str(cm[0][1]) + "\t" +
                    str(cm[0][2]) + "\t" + str(cm[0][3]) + "\n")
            f.write("1 |\t" + str(cm[1][0]) + "\t" + str(cm[1][1]) + "\t" +
                    str(cm[1][2]) + "\t" + str(cm[1][3]) + "\n")
            f.write("2 |\t" + str(cm[2][0]) + "\t" + str(cm[2][1]) + "\t" +
                    str(cm[2][2]) + "\t" + str(cm[2][3]) + "\n")
            f.write("3 |\t" + str(cm[3][0]) + "\t" + str(cm[3][1]) + "\t" +
                    str(cm[3][2]) + "\t" + str(cm[3][3]) + "\n")
        f.write("\n\n")
        f.write("************************ REPORT ***********************\n")
        f.write(report)
        f.close()

    print(str(codigo) + ": tree created")
    return (arbol, arbol.score(X_test, y_test))
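# A minimal usage sketch (the station code 4 is hypothetical; real codes match
# the directory names under ../data/entrenamiento/):
# arbol, acierto = generar_arbol(4, profundidad=5)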
Example #7
    train_loader = DataLoader(training, batch_size=train_size, shuffle=True)
    val_loader = DataLoader(val, batch_size=val_size, shuffle=False)
    print("Dataset loaded!")
    trees = []
    for i, (feature, label) in enumerate(train_loader):
        for max_depth in (2, 3, 5, 7, 10):
            print("Max depth: {}".format(max_depth))
            clf = tree.DecisionTreeClassifier(max_depth=max_depth)
            clf = clf.fit(np.array(feature), np.array(label))
            training_predictions = clf.predict(np.array(feature))
            training_accuracy = metrics.accuracy_score(np.array(label), training_predictions)
            for j, (val_feature, val_label) in enumerate(val_loader):
                validation_predictions = clf.predict(np.array(val_feature))
                validation_accuracy = metrics.accuracy_score(np.array(val_label), validation_predictions)
                confusion_matrix = str(metrics.confusion_matrix(np.array(val_label), validation_predictions))
            print(training_accuracy)
            print(validation_accuracy)
            print(confusion_matrix)
            # import pdb; pdb.set_trace()  # debug breakpoint from the original, disabled
            r = export_text(clf, feature_names=['max_speed', 'max_dec', 'max_acc', 'max_acc_total'])
            print(r)
    # model = SafetyFeatureModel(4, 2)
    # weight = torch.tensor([1, 3], dtype=torch.float)
    # if torch.cuda.is_available():
    #     model.cuda()
    #     weight = weight.cuda()
    # loss_function = nn.CrossEntropyLoss(weight)
    # optimizer = optim.SGD(model.parameters(), lr=0.00001)
    # train(model, train_loader, val_loader, loss_function, optimizer, 500)
Example #8
extended_features_names = [
    '(!!data.breathingProblems or !!data.fever or !!data.cough)',
    '(!!data.breathingProblems and !!data.fever and !!data.cough)'
]

def compute_extended_features(features):
    return [
        min(1, features[1] + features[2] + features[3]),
        features[1] * features[2] * features[3]
    ]
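# Quick sanity check (hypothetical input: indexes 1-3 are the
# breathingProblems / fever / cough flags used above):
# compute_extended_features([0, 1, 0, 1])  ->  [1, 0]  (the OR fires, the AND does not)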



# classes_indexes (label -> integer id) is defined earlier in the original script
data = []
classes = []
with open('./tree.tsv', 'r') as f:
    for line in f:
        split = line.rstrip().split('\t')
        features = [1 if val == "Oui" else 0 for val in split[:-1]]
        extended_features = compute_extended_features(features)
        data.append(features + extended_features)
        classes.append(classes_indexes[split[-1]])


clf = tree.DecisionTreeClassifier(criterion='gini')
clf = clf.fit(data, classes)
res = export_text(clf, feature_names=base_features_names + extended_features_names)
res = res.replace('---', 'if ').replace(' >  0.50', ' is true:').replace('<= 0.50', 'is false:')
print(res)
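# For reference, a rule line such as
#   "|--- (!!data.breathingProblems and !!data.fever and !!data.cough) >  0.50"
# prints as
#   "|if  (!!data.breathingProblems and !!data.fever and !!data.cough) is true:"
# after the substitutions above.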
Example #9
# test_idx holds the indices held out for testing (defined earlier in the original script)
train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)

# test data
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]

classifier = tree.DecisionTreeClassifier()
classifier.fit(train_data, train_target)

print(test_target)
print(classifier.predict(test_data))

#dot_data = tree.export_graphviz(classifier, out_file=None,
#                feature_names=iris.feature_names,
#                class_names=iris.target_names,
#                filled=True, rounded=True,
#                special_characters=True)

#graph = graphviz.Source(dot_data)
#graph.render("iris")

print(iris.feature_names, iris.target_names)
print(test_data[0], test_target[0])
print(test_data[1], test_target[1])
print(test_data[2], test_target[2])

r = export_text(classifier, feature_names=iris['feature_names'])
print("\n\n Text Decision Tree\n ******************\n")
print(r)
Example #10
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
# To plot tree
tree.plot_tree(clf)

# To export tree in pdf file
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("D:/diabeteas")

# To plot tree in colored format
# Instead of X_train.columns we can also pass feature_cols directly
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=X_train.columns,
                                class_names=pima.label,
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph

# To export tree in text format
r = export_text(clf, feature_names=feature_cols)

print(r)
Example #11
def trea(X, y):
    decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
    decision_tree = decision_tree.fit(X, y)
    r = export_text(decision_tree)
    print(r)
    def classify_reviews(self):
        """
        Classifies each review into either iOS 13-related or not iOS 13-related using 
        a naive REGEX approach and a more sophisticated machine learning approach.
        """
        df = pd.read_csv(DataProcessor.dir_name + "/../../data_files/processed_dataset.csv")
        df = df.fillna(value={'document': " "})
        df["re_flag"] = None # regular expression flag
        df["ml_flag"] = None # machine learning flag
        
        # regular expression approach for explicit iOS 13 references
        regex_string = r"((ios|iso|ois|osi|sio|soi|os)\s*13)|(13\s*(ios|iso|ois|osi|sio|soi|os))"
        df["re_flag"] = df.document.str.contains(regex_string,regex=True)
  
        # machine learning approach for implicit iOS 13 references
        # create training data set
        data = df
        sample_size = 21700
        data = pd.concat([data.sample(sample_size), 
                             data[data["re_flag"] == True]]).drop_duplicates().reset_index(drop=True)
        data = shuffle(data)
 
        # create tfidf vector as feature vector and another class label vector
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_tfidf_vector = tfidf_vectorizer.fit_transform(data["document"])
        y_vector = np.where(data['re_flag']==True, 1, 0)
        
        # create the same vectors for the entire dataset
        data = shuffle(df)
        data = data.fillna(value={'document': " "})
        x = tfidf_vectorizer.transform(data["document"])
        y = np.where(data['re_flag']==True, 1, 0)       
        
        # split data into train and test set
        x_train, x_test, y_train, y_test = train_test_split(x_tfidf_vector, y_vector, 
                                                    test_size=0.2, shuffle=False)
        
        # train multinomial naive Bayes model
        nb = MultinomialNB().fit(x_train, y_train)
        y_predicted = nb.predict(x_test)
        print("MultinomialNB")
        print(classification_report(y_test,y_predicted))
        print(pd.crosstab(y_test, y_predicted, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y_test, y_predicted))
        y_hats = nb.predict(x)
        print(classification_report(y,y_hats))
        print(pd.crosstab(y,y_hats, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y,y_hats))        
        
        # train logistic regression model
        lr = LogisticRegression(solver='lbfgs').fit(x_train, y_train)
        y_predicted = lr.predict(x_test)
        print("LogisticRegression")
        print(classification_report(y_test,y_predicted))
        print(pd.crosstab(y_test, y_predicted, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y_test, y_predicted))
        y_hats = lr.predict(x)
        print(classification_report(y,y_hats))
        print(pd.crosstab(y,y_hats, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y,y_hats))
        
        # train random forest model
        rf = RandomForestClassifier(n_estimators=50).fit(x_train, y_train)
        y_predicted = rf.predict(x_test)
        print("RandomForestClassifier")
        print(classification_report(y_test,y_predicted))
        print(pd.crosstab(y_test, y_predicted, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y_test, y_predicted))
        y_hats = rf.predict(x)
        print(classification_report(y,y_hats))
        print(pd.crosstab(y,y_hats, rownames=['True'], colnames=['Predicted'], margins=True))
        print(accuracy_score(y,y_hats))
        feature_names = tfidf_vectorizer.get_feature_names()
        # print a randomly chosen decision tree from the random forest model
        estimator = rf.estimators_[random.randrange(0, 50)]
        tree_rules = export_text(estimator, feature_names=feature_names)
        print(tree_rules)
 
        # conduct five-fold cross validation
        models = [
            RandomForestClassifier(n_estimators=50),
            MultinomialNB(),
            LogisticRegression(),
        ]
        cv_fold = 5
        entries = []
        for model in models:
            model_name = model.__class__.__name__
            accuracies = cross_val_score(model, x_tfidf_vector, y_vector,
                                         scoring='accuracy', cv=cv_fold)
            for fold_idx, accuracy in enumerate(accuracies):
                entries.append((model_name, fold_idx, accuracy))
        cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
        
        # visualization of results
        # print boxplots with model accuracies
        sns.set(rc={'figure.figsize':(14,6)})   
        sns.set_style('whitegrid', {'font.family':'serif', 'font.serif':'Times New Roman'})
        plot = sns.boxplot(x='accuracy', y='model_name', color="0.90", data=cv_df, 
                    order=["MultinomialNB","LogisticRegression","RandomForestClassifier"],
              orient="h", linewidth=3)
        
        sns.swarmplot(x='accuracy', y='model_name', data=cv_df, 
              size=10, edgecolor="gray", color="black", 
              linewidth=1, order=["MultinomialNB", "LogisticRegression",
                                  "RandomForestClassifier"],
                                  orient="h")
        plot.set_xlabel("Accuracy",fontsize=25)
        plot.set_ylabel("Model name",fontsize=25)
        plot.tick_params(labelsize=20)
           
        # store predictions of the two classification approaches       
        data.loc[:,"ml_flag"] = y_hats # machine learning predictions
        data["re_flag"] = data["re_flag"].astype(int) # regex predictions
        data = data.reset_index(drop=True)
        
        data.to_csv(DataProcessor.dir_name + "/../../data_files/processed_dataset.csv", encoding="utf-8", index=False)
        return df
Example #13
#noBlanks = df[df.apply(lambda x: x.count(), axis=1) > 44]

x = df[xCols]
y = df["Stuck"]

clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(x, y)

fig, ax = plt.subplots()
fig.set_figheight(12)
fig.set_figwidth(15)
tree.plot_tree(clf, ax=ax)

#dot_data = tree.export_graphviz(clf, out_file=None)
#graph = graphviz.Source(dot_data)
r = export_text(clf, feature_names=xCols)
print(r)

print(confusion_matrix(y, clf.predict(x)))


# Compute ROC curve and ROC area for each class
fpr = []
tpr = []
roc_auc = dict()
plt.figure()

# ROC curve for different max depths for tree
for i in range(2,10):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    clf = clf.fit(x, y)
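    # Hedged completion sketch: the original snippet is truncated here. One
    # plausible continuation per depth (roc_curve from sklearn.metrics):
    # probs = clf.predict_proba(x)[:, 1]
    # fpr_i, tpr_i, _ = roc_curve(y, probs)
    # plt.plot(fpr_i, tpr_i, label='max_depth={}'.format(i))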
Example #14
# clf below is the classifier fitted earlier in the original example
clf.predict([[2., 2.]])

clf.predict_proba([[2., 2.]])

X, y = load_iris(return_X_y=True)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)

tree.plot_tree(clf)  # clf is already fitted above

dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("iris")

dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph

iris = load_iris()
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(iris.data, iris.target)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)
# tree_model, knn_model and the train/test splits below come from earlier cells of the original notebook
print('Accuracy: %.2f%%' % (100.0 * tree_model.score(X_test, y_test)))


# decision boundary
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
train_len = X_train.shape[0]
combined_len = X_combined.shape[0]

plt.figure(figsize=(3, 3), dpi=300)
plot_decision_regions(X=X_combined, y=y_combined, classifier=knn_model, test_idx=range(train_len, combined_len))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_01.png', dpi=300)
plt.show()

# plot the tree
from sklearn.tree import export_text
print(export_text(tree_model, feature_names=list(X.columns)))

from sklearn.tree import plot_tree
plt.figure(figsize=(6, 6), dpi=300)
plot_tree(tree_model, filled=True)
plt.show()

# Attribute Importance
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(tree_model.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)
Example #16
#set up matplot figure
fig, ax = plt.subplots(figsize=(8, 8))
#plot the decision tree
tree.plot_tree(clf_final, feature_names=list(x_train), filled=True)
plt.savefig('{}/decision_tree_{}.png'.format(path, args.train),
            dpi=400)  #save the figure
#alternative way to plot the decision tree by manually entering the size and colour of the arrows
# out = tree.plot_tree(clf_final, feature_names=list(x_train), filled = True)
# for o in out:
#     arrow = o.arrow_patch
#     if arrow is not None:
#         arrow.set_edgecolor('black')
#         arrow.set_linewidth(3)

#plot and output the decision tree as text
tree_rules = export_text(clf_final, feature_names=list(X.columns))
print(tree_rules)

#extract feature importances
fi = pd.DataFrame({'feature': list(x_train.columns),
                   'importance': clf_final.feature_importances_}).\
                    sort_values('importance', ascending = False)

print(fi.head())

#set up matplot figure
fig = plt.figure(figsize=(8, 8))
#plot bar chart showing feature importances
sns.barplot(x=fi.feature, y=fi.importance)
#add labels to the graph
plt.xlabel('Features')  #rename the x-axis title