def describe_model(model, X, Y, log, plot_all):
    """Append a model summary (model + predictor names) to *log*; optionally export the tree as SVG.

    Returns the accumulated log string.
    """
    predictors = X.columns.values.tolist()
    log += print_and_log("Model:")
    log += print_and_log(model)
    log += print_and_log("Predictor variables:")
    for name in predictors:
        log += print_and_log("\t{}".format(name))
    # Export tree SVG
    if plot_all:
        from dtreeviz.trees import dtreeviz
        log += print_and_log("\nExport tree to SVG:")
        tree_plot = dtreeviz(
            model,
            X.values,
            Y.values.ravel(),
            target_name="perf",
            feature_names=predictors,
        )
        tree_plot.save("trytree.svg")
        tree_plot.view()
    return log
def viz_digits(orientation="TD", max_depth=3, random_state=666, fancy=True, pickX=False):
    """Fit a depth-limited classifier on the digits dataset and return its dtreeviz plot.

    When *pickX* is true, one random observation's prediction path is highlighted.
    """
    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    digits = load_digits()
    # "8x8 image of integer pixels in the range 0..16."
    columns = [f'pixel[{r},{c}]' for r in range(8) for c in range(8)]
    model.fit(digits.data, digits.target)
    observation = None
    if pickX:
        observation = digits.data[np.random.randint(0, len(digits.data)), :]
    return dtreeviz(model,
                    digits.data,
                    digits.target,
                    target_name='number',
                    feature_names=columns,
                    orientation=orientation,
                    class_names=[chr(c) for c in range(ord('0'), ord('9') + 1)],
                    fancy=fancy,
                    histtype='bar',
                    X=observation)
def viz_tree(clf, Xdata, Ydata, label, outfile, point=None):
    """Render the decision tree for one output label of a multi-label classifier.

    Fix: removed the unused local import ``from sklearn.tree import export_graphviz``.

    Parameters
    ----------
    clf : multilearn-style wrapper exposing one fitted classifier per Y column
        via ``classifiers_``.
    Xdata, Ydata : objects exposing the data as pandas frames via ``.pd``
        (Xdata also exposes coordinates via ``.vtk.points``).
    label : name of the Y column whose tree is drawn.
    outfile : path of the SVG file to write.
    point : optional 3-D coordinate; the data point nearest to it is used as
        the highlighted observation for the tree walk.
    """
    from dtreeviz.trees import dtreeviz

    X = Xdata.pd
    Y = Ydata.pd
    X_headers = X.columns
    Y_headers = Y.columns
    # If "point" given as argument, find the closest datapoint to "point";
    # the tree walk is drawn for that observation.
    if point is not None:
        points = Xdata.vtk.points
        delta = points - point
        dist = np.sqrt(delta[:, 0] ** 2.0 + delta[:, 1] ** 2.0 + delta[:, 2] ** 2.0)
        loc = np.argmin(dist)
        print('point = ', point)
        print('nearest point = ', points[loc, :])
        print('distance = ', dist[loc])
        datapoint = X.iloc[loc]
    else:
        datapoint = None
    # Extract the classifier object from the clf multilearn object.
    index = Y_headers.to_list().index(label)
    clf = clf.classifiers_[index]
    # TODO: check if clf is a decision tree
    viz = dtreeviz(clf, X, Y[label], feature_names=X_headers, target_name=label,
                   class_names=["False", "True"], X=datapoint)
    viz.save(outfile)
def viz_boston(orientation="TD", max_depth=3, random_state=666, fancy=True):
    """Fit a regression tree on the Boston housing data and return its dtreeviz plot.

    Also dumps the classic scikit-learn graphviz rendering to /tmp for comparison.
    """
    model = tree.DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    boston = load_boston()
    model = model.fit(boston.data, boston.target)
    # One random observation whose prediction path will be highlighted.
    sample = boston.data[np.random.randint(0, len(boston.data)), :]
    print(boston.feature_names)
    features = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                         'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])
    plot = dtreeviz(model,
                    boston.data,
                    boston.target,
                    target_name='price',
                    feature_names=features,
                    orientation=orientation,
                    fancy=fancy,
                    X=sample)
    export_graphviz(model,
                    out_file="/tmp/boston-scikit-tree.dot",
                    filled=True,
                    rounded=True,
                    special_characters=True)
    return plot
def viz_diabetes(orientation="TD", max_depth=3, random_state=666, fancy=True, pickX=False):
    """Fit a regression tree on the diabetes dataset and return its dtreeviz plot.

    When *pickX* is true, one random observation's prediction path is highlighted.
    """
    diabetes = load_diabetes()
    model = tree.DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model.fit(diabetes.data, diabetes.target)
    sample = None
    if pickX:
        sample = diabetes.data[np.random.randint(0, len(diabetes.data)), :]
    return dtreeviz(model,
                    diabetes.data,
                    diabetes.target,
                    target_name='progr',
                    feature_names=diabetes.feature_names,
                    orientation=orientation,
                    fancy=fancy,
                    X=sample)
def viz_knowledge(orientation="TD", max_depth=3, random_state=666, fancy=True):
    """Fit a classifier on the UCI User Knowledge Modeling data and return its dtreeviz plot."""
    # data from https://archive.ics.uci.edu/ml/datasets/User+Knowledge+Modeling
    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    know = pd.read_csv("data/knowledge.csv")
    target_names = ['very_low', 'Low', 'Middle', 'High']
    # Encode the string target as integer codes, in target_names order.
    know['UNS'] = know['UNS'].map({n: i for i, n in enumerate(target_names)})
    X_train = know.drop('UNS', axis=1)
    y_train = know['UNS']
    # Only the two features PEG and LPR are used for the tree.
    model = model.fit(X_train[['PEG', 'LPR']], y_train)
    # One random observation (computed but not highlighted in the plot below).
    X = X_train.iloc[np.random.randint(0, len(know))]
    return dtreeviz(model,
                    X_train[['PEG', 'LPR']],
                    y_train,
                    target_name='UNS',
                    feature_names=['PEG', 'LPR'],
                    orientation=orientation,
                    class_names=target_names,
                    histtype='strip',
                    fancy=fancy)
def visualize(clf):
    """Display a dtreeviz rendering of *clf* fitted on the HSV training data."""
    train, target = read_train_data()
    plot = dtreeviz(clf,
                    train,
                    target,
                    target_name='HSV',
                    feature_names=['Hue', 'Saturation', 'Value'],
                    class_names=['Yellow', 'Green', 'Red', 'Blue', 'Black', 'White'])
    plot.view()
def export_tree_advanced(self, X, Y, feature_names, filename_out):
    """Render the fitted tree with custom per-class colours and save it to *filename_out*."""
    plot = dtreeviz(self.model, X, Y,
                    feature_names=feature_names,
                    colors={'classes': [None, None, ['#0080FF80', '#FF800080']]})
    plot.save(filename_out)
    # dtreeviz leaves an extension-less temporary copy next to the output; delete it.
    extension = filename_out.split('.')[-1]
    stem = filename_out.split('.' + extension)[0]
    if os.path.isfile(stem):
        os.remove(stem)
    return
# ----------------------------------------------------------------------------------------------------------------------
def interpret(
    self,
    X_train,
    y_train,
    X_validation,
    y_validation,
    model_file_path,
    learner_name,
    target_name=None,
    class_names=None,
    metric_name=None,
    ml_task=None,
    explain_level=2,
):
    """Run the base interpretation, then save a dtreeviz SVG and the tree's rules.

    Skips all work when *explain_level* is 0; visualization failures are
    logged, not raised.
    """
    super(DecisionTreeAlgorithm, self).interpret(
        X_train,
        y_train,
        X_validation,
        y_validation,
        model_file_path,
        learner_name,
        target_name,
        class_names,
        metric_name,
        ml_task,
        explain_level,
    )
    if explain_level == 0:
        return
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore")
        try:
            if len(class_names) > 10:
                # dtreeviz does not support more than 10 classes
                return
            plot = dtreeviz(
                self.model,
                X_train,
                y_train,
                target_name="target",
                feature_names=X_train.columns,
                class_names=class_names,
            )
            plot.save(os.path.join(model_file_path, learner_name + "_tree.svg"))
        except Exception as e:
            logger.info(f"Problem when visualizing decision tree. {str(e)}")
    save_rules(self.model, X_train.columns, class_names, model_file_path,
               learner_name)
def explain(skater_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    """Fit a Skater tree surrogate for the job's model and return its SVG text.

    Returns the SVG content as a string, or the message
    'The file size is too big' when the rendered file exceeds 15 MB.

    Fixes: the SVG file handle was opened without being closed (resource
    leak) — now uses ``with``; ``DataFrame.drop`` used the deprecated
    positional ``axis`` argument (removed in pandas 2.0) — now keyword.
    """
    job = skater_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values
    model_inst = InMemoryModel(model.predict,
                               examples=X_train,
                               model_type=model._estimator_type,
                               unique_values=[1, 2],
                               feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train, Y_train, use_oracle=True, prune='post',
                            scorer_type='default')
    surrogate_explainer.class_names = features
    viz = dtreeviz(surrogate_explainer.estimator_, X_train, Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True, X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    name = create_unique_name("skater_plot.svg")
    viz.save(name)
    if os.path.getsize(name) > 15000000:
        return 'The file size is too big'
    # Read the SVG and close the handle deterministically.
    with open(name, "r") as f:
        response = f.read()
    os.remove(name)
    # dtreeviz also writes an extension-less temp file; clean it up too.
    if os.path.isfile(name.split('.svg')[0]):
        os.remove(name.split('.svg')[0])
    return response
def viz_iris(orientation="TD", max_depth=5, random_state=666, fancy=True):
    """Fit a classifier on iris, cross-check ShadowDecTree predictions, and return the dtreeviz plot.

    Fix: removed dead code — ``features = list(data.columns)`` was immediately
    overwritten by the hard-coded array, and ``path`` was computed but never used.
    """
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    iris = load_iris()
    data = pd.DataFrame(iris.data)
    data.columns = iris.feature_names
    clf = clf.fit(data, iris.target)
    # Compare sklearn's prediction with the ShadowDecTree re-implementation for
    # one sample observation (index 60); x is reused below as the highlighted point.
    # for i in range(len(iris.data)):
    for i in [60]:
        x = data.iloc[i]
        pred = clf.predict([x.values])
        shadow_tree = ShadowDecTree(
            clf, iris.data, iris.target,
            feature_names=iris.feature_names,
            class_names=["setosa", "versicolor", "virginica"])
        pred2 = shadow_tree.predict(x.values)
        print(
            f'{x} -> {pred[0]} vs mine {pred2[0]}, path = {[f"node{p.feature_name()}" for p in pred2[1]]}'
        )
        if pred[0] != pred2[0]:
            print("MISMATCH!")
    features = np.array([
        'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'petal width (cm)'
    ])
    st = dtreeviz(
        clf,
        iris.data,
        iris.target,
        target_name='variety',
        feature_names=features,
        orientation=orientation,
        class_names=["setosa", "versicolor", "virginica"],  # 0,1,2 targets
        # histtype='strip',
        fancy=fancy,
        X=x)
    return st
def simple_dtree(df, x_list, y_var, max_depth=3, regressor=False,
                 min_samples_split=2, test_size=0.3):
    """Instant DecisionTree model.

    Fits a decision tree (classifier by default, regressor when
    ``regressor=True``) on ``df[x_list]`` vs ``df[y_var]``, prints train/test
    scores, shows a dtreeviz rendering, and returns the fitted model.

    Bug fixed: the original evaluated ``model.classes_`` unconditionally,
    which raises AttributeError for DecisionTreeRegressor (regressors expose
    no ``classes_``); class names are now looked up only for classifiers.
    """
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from dtreeviz.trees import dtreeviz

    X = df[x_list]
    y = df[y_var]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=123, test_size=test_size)

    # build model, and fitting
    if regressor:
        model = DecisionTreeRegressor(max_depth=max_depth, random_state=123,
                                      min_samples_split=min_samples_split)
    else:
        model = DecisionTreeClassifier(max_depth=max_depth, random_state=123,
                                       min_samples_split=min_samples_split)
    model.fit(X_train, y_train)

    # plot
    print('Train score: {:.3f}'.format(model.score(X_train, y_train)))
    print('Test score: {:.3f}'.format(model.score(X_test, y_test)))
    viz = dtreeviz(
        model,
        X_train=X,
        y_train=y,
        target_name=y_var,
        feature_names=x_list,
        precision=2,
        # Regressors have no classes_; only classifiers get class names.
        class_names=None if regressor else model.classes_.tolist(),
    )
    viz.view()
    return model
def weird_binary_case():
    """Reproduce https://github.com/parrt/dtreeviz/issues/17 (binary +/-1 features)."""
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    from dtreeviz.trees import dtreeviz

    features = np.random.choice([-1, 1], size=(100, 2))
    labels = np.random.choice([0, 1], size=100)
    stump = DecisionTreeClassifier(max_depth=1).fit(features, labels)
    return dtreeviz(tree_model=stump,
                    X_train=features,
                    y_train=labels,
                    feature_names=['a', 'b'],
                    target_name='y',
                    class_names=[1, 0])
def interpret(
    self,
    X_train,
    y_train,
    X_validation,
    y_validation,
    model_file_path,
    learner_name,
    target_name=None,
    class_names=None,
    metric_name=None,
    ml_task=None,
    explain_level=2,
):
    """Run the base interpretation, save a dtreeviz SVG of the regressor, then save its rules.

    Skips all work when *explain_level* is 0; visualization failures are
    logged, not raised.

    Fix: corrected the typo "visuzalizin" in the failure log message.
    """
    super(DecisionTreeRegressorAlgorithm, self).interpret(
        X_train,
        y_train,
        X_validation,
        y_validation,
        model_file_path,
        learner_name,
        target_name,
        class_names,
        metric_name,
        ml_task,
        explain_level,
    )
    if explain_level == 0:
        return
    try:
        viz = dtreeviz(
            self.model,
            X_train,
            y_train,
            target_name="target",
            feature_names=X_train.columns,
        )
        tree_file_plot = os.path.join(model_file_path, learner_name + "_tree.svg")
        viz.save(tree_file_plot)
    except Exception as e:
        logger.info(
            f"Problem when visualizing decision tree regressor. {str(e)}")
    save_rules(self.model, X_train.columns, None, model_file_path, learner_name)
def explore_prediction():
    """Interactive view of how the classifier handles one randomly drawn prediction."""
    sample = df.sample()
    display(sample)
    rendering = dtreeviz.dtreeviz(
        clf,
        X,
        y,
        target_name='smoker',
        feature_names=list(X.columns),
        class_names=list(y_encoder.classes_),
        scale=1.0,
        X=sample[X.columns].iloc[0].values,
    )
    display(rendering)
def interpret(self, X, y, model_file_path, learner_name,
              target_name=None, class_names=None):
    """Best-effort dtreeviz rendering; stores the SVG path in self._tree_file_plot, or None on failure."""
    try:
        plot = dtreeviz(self.model, X, y,
                        target_name="target",
                        feature_names=X.columns)
        self._tree_file_plot = os.path.join(model_file_path,
                                            learner_name + "_tree.svg")
        plot.save(self._tree_file_plot)
    except Exception as e:
        # Any failure is swallowed deliberately; the caller checks for None.
        self._tree_file_plot = None
def visualize_decision_tree(self, X_train, Y_train, feature_names, max_depth=3):
    """Fit a shallow surrogate tree on (sub)sampled training data and display it once per prediction column.

    Bug fixed: class names were built with ``list(set(...))``, whose order is
    arbitrary and need not match the classifier's internal class order, so
    labels could attach to the wrong classes in the plot; names are now taken
    from ``clf.classes_`` (the order dtreeviz expects).
    """
    X_train = self.sample_like(X_train, frac=self.sample_frac)
    Y_train = self.sample_like(Y_train, frac=self.sample_frac)
    Y_train = self.ravel_like(Y_train)
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(X_train, Y_train)
    for pred_col in self.pred_cols:
        viz = dtreeviz(clf,
                       X_train,
                       Y_train,
                       target_name=pred_col,
                       feature_names=feature_names,
                       class_names=[str(c) for c in clf.classes_])
        display(viz)
def getDtreeVizImg(rand_state=42, max_depth=3):
    """Return a base64-encoded PNG of a dtreeviz tree for the global ``df``, cached on disk."""
    fname = (BASE_IMG_PATH + "dtv" + str(rand_state) + '_' +
             str(max_depth) + '.png')
    if (not os.path.exists(fname)):
        X = df.drop(['Class'], axis=1)
        y = df['Class']
        # Split into train/test. Since the focus here is the new-style decision
        # tree rendering rather than predictions, the test set is unused.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=rand_state)
        # Fit an entropy-based decision tree on the training split.
        DecsTreeModel = DecisionTreeClassifier(criterion='entropy',
                                               max_depth=max_depth)
        DecsTreeModel.fit(X_train, y_train)
        # Render with dtreeviz (SVG only)...
        viz = dtreeviz(DecsTreeModel, X_train, y_train,
                       feature_names=list(X.columns.values),
                       class_names=['0', '1'])
        viz.save(BASE_IMG_PATH + "svgtempfile.svg")
        # ...then convert the SVG to PNG and cache it under fname.
        out = BytesIO()
        cairosvg.svg2png(url=BASE_IMG_PATH + "svgtempfile.svg", write_to=out)
        Image.open(out).save(fname)
    with open(fname, "rb") as image:
        img = image.read()
    # Base64 makes the image bytes safe to persist or send over the network.
    return base64.b64encode(img).decode('utf-8')
def run_single_tree(X_train, y_train, X_test, y_test, depth, plot_tree, feature_cols):
    """Fit one regression tree of the given depth, report scores/RMSE, save a residual
    scatter plot, optionally render the tree, and return (train_score, test_score)."""
    model = tree.DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    accuracy_train = model.score(X_train, y_train)
    accuracy_test = model.score(X_test, y_test)
    y_predictions = model.predict(X_test)
    y_predictions_series = pandas.Series(y_predictions, index=y_test.index)

    def resid(row):
        return row["true"] - row["pred"]

    resid_df = pandas.concat([y_test, y_predictions_series],
                             keys=["true", "pred"], axis=1)
    resid_df["residual"] = resid_df.apply(resid, axis=1)
    # Scatter only the first ~1/30th of the residuals to keep the plot light.
    cutoff = round(len(resid_df) / 30)
    plt.scatter(resid_df["true"][:cutoff], resid_df["residual"][:cutoff],
                alpha=0.5)
    plt.savefig(f"plots/residplot-depth-{depth}.png")
    plt.clf()
    print('Single tree depth: ', depth)
    print('Accuracy, Training Set: ', round(accuracy_train * 100, 5), '%')
    print('Accuracy, Test Set: ', round(accuracy_test * 100, 5), '%')
    print('RMSE Tree, Test Set: ', sqrt(mean_squared_error(y_test, y_predictions)))
    if plot_tree:
        rendering = dtreeviz(model, X_train, y_train, fancy=False,
                             target_name='chance', feature_names=feature_cols)
        rendering.save("plots/plot.svg")
    return accuracy_train, accuracy_test
def produce_tree_visualization(tree, tree_index, x, y, target_name, feature_names,
                               class_names, model_uid):
    """Produces visualization of a decision tree from an ensemble.

    :param tree: tree model
    :param tree_index: index of the tree in the ensemble
    :param x: predictor matrix
    :param y: target series
    :param target_name: name of the target
    :param feature_names: list of feature names
    :param class_names: name of the target classes
    :param model_uid: name of the model
    """
    rendering = dtreeviz(tree.estimators_[tree_index], x, y,
                         target_name=target_name,
                         feature_names=feature_names,
                         class_names=class_names)
    destination = os.path.join('modeling', model_uid, 'diagnostics', 'trees',
                               f'decision_tree_{tree_index}.svg')
    rendering.save(destination)
def create_tree(df, max_depth, target='admitted', **kwargs):
    """Fit a DecisionTreeClassifier on *df* predicting the *target* column.

    Returns (fitted tree, classification report, dtreeviz rendering).

    Fixes: the dtreeviz ``target_name`` was hard-coded to 'admitted', so a
    non-default *target* produced a mislabeled plot — it now uses *target*;
    the local variable ``tree`` (which shadowed the common sklearn module
    name) was renamed.
    """
    X = df.drop(columns=[target])
    y = df[target]
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        **kwargs,
    )
    model.fit(X, y)
    rep = report(y, model.predict(X))
    viz = dtreeviz(
        model,
        X,
        y,
        target_name=target,
        feature_names=X.columns,
        class_names=['no', 'yes'],
    )
    return model, rep, viz
def describe_hpo(gs, X, Y, log, plot_all):
    """Append a grid-search summary (best params + best estimator) to *log*;
    optionally export the best estimator's tree as SVG. Returns the log string."""
    predictors = X.columns.values.tolist()
    rule = "----------------------------------------------------------------------------"
    log += print_and_log("\n" + rule)
    log += print_and_log("Available predictor variables:")
    for name in predictors:
        log += print_and_log("\t{}".format(name))
    log += print_and_log("\nBest parameters set found on development set:")
    for key, value in gs.best_params_.items():
        log += print_and_log("\t{}: {}".format(key, value))
    log += print_and_log("\nBest estimator:")
    best_estimator = gs.best_estimator_._final_estimator
    log += print_and_log(best_estimator)
    log += print_and_log(rule)
    # Export tree SVG
    if plot_all:
        from dtreeviz.trees import dtreeviz
        log += print_and_log("\nExport tree to SVG:")
        tree_plot = dtreeviz(
            best_estimator,
            X.values,
            Y.values.ravel(),
            target_name="perf",
            feature_names=predictors,
        )
        tree_plot.save("trytree.svg")
        tree_plot.view()
    return log
def viz_wine(orientation="TD", max_depth=3, random_state=666, fancy=True, pickX=False):
    """Fit a classifier on the wine dataset and return its dtreeviz plot.

    Fixes (brings this function in line with the sibling viz_* helpers):
    - ``random_state`` was accepted but never passed to the classifier,
      making runs non-reproducible;
    - ``orientation`` and ``fancy`` were accepted but never forwarded to
      dtreeviz;
    - the random row index used ``len(X_train.data)`` (the array's raw
      buffer) instead of the conventional ``len(X_train)``.
    """
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    wine = load_wine()
    X_train = wine.data
    y_train = wine.target
    clf.fit(X_train, y_train)
    X = None
    if pickX:
        # One observation whose prediction path will be highlighted.
        X = X_train[np.random.randint(0, len(X_train)), :]
    viz = dtreeviz(clf,
                   wine.data,
                   wine.target,
                   target_name='wine',
                   feature_names=wine.feature_names,
                   orientation=orientation,
                   fancy=fancy,
                   class_names=list(wine.target_names),
                   X=X)  # pass the test observation
    return viz
def interpret(self, X, y, model_file_path, learner_name,
              target_name=None, class_names=None):
    """Best-effort dtreeviz rendering of the classifier; stores the SVG path or None on failure."""
    try:
        # dtreeviz does not support more than 10 classes
        if len(class_names) > 10:
            return
        plot = dtreeviz(
            self.model,
            X,
            y,
            target_name="target",
            feature_names=X.columns,
            class_names=class_names,
        )
        self._tree_file_plot = os.path.join(model_file_path,
                                            learner_name + "_tree.svg")
        plot.save(self._tree_file_plot)
    except Exception as e:
        # Any failure is swallowed deliberately; the caller checks for None.
        self._tree_file_plot = None
from dtreeviz.trees import dtreeviz
from sklearn import tree
from sklearn.datasets import load_wine

# Fit a depth-2 classifier on the wine dataset and display its dtreeviz rendering.
wine = load_wine()
classifier = tree.DecisionTreeClassifier(max_depth=2).fit(wine.data, wine.target)

vis = dtreeviz(
    classifier,
    wine.data,
    wine.target,
    target_name="wine_type",
    feature_names=wine.feature_names,
)
vis.view()
# .set_axis(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', # 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', # 'X21', 'X22', 'X23', 'Y'], axis=1)\ # .to_csv('csv/accomodation_data.csv') # 4 決定木によるモデル構築 -------------------------------------------------------- # モデル構築 # --- 決定木による分類器 clf = DecisionTreeClassifier(max_depth=2) clf = clf.fit(X=features, y=data_o) # 確認 vars(clf) # 5 決定木の可視化 --------------------------------------------------------------- # indexの抽出 time_index = df_info.resample('M').count().index # 決定木を描画 viz = dtreeviz( clf, features, data_o, target_name='Class', feature_names=time_index, class_names=['False', 'True'], ) viz
#print representation text_representation = tree.export_text(clf) print(text_representation) #save to a file with open('decisiontree1.log','w') as fout : fout.write(text_representation) #plottree tree.plot_tree(decision_tree=clf) fig = plt.figure(figsize=(10,8)) _ = tree.plot_tree(clf, feature_names= iris.feature_names, class_names=iris.target_names, filled=True) #see plot import graphviz dot_data = tree.export_graphviz(decision_tree=clf, out_file=None, feature_names = iris.feature_names, class_names = iris.target_names, filled=True) graph = graphviz.Source(dot_data, format='png') #error dot path import os os.environ["PATH"] += os.pathsep + 'c:/Program Files (x86)/Graphviz2.38/bin/' graph = graphviz.Source(dot_data, format='png') graph graph.render('DecisionTree.png ') #%%%Plot DT with dtreeviz #pip install dtreeviz from dtreeviz.trees import dtreeviz viz = dtreeviz(clf, X,y, target_name='target', feature_names = iris.feature_names, class_names = list(iris.target_names)) viz viz.save('dt2.svg')
import pandas as pd
from sys import argv
from dtreeviz.trees import dtreeviz
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# Load the CSV named on the command line.
data = pd.read_csv(argv[1])
cols = list(data.columns)
full = cols[1:-4]  # the first one is the date and the last four are the labels
labels = cols[-4]  # these are the raw labels (0, 1, 2)

# Keep only features named on the command line; with no extra args, keep them all.
names = [f for f in full if f in argv or len(argv) < 3]

X = data[names]
y = data[labels]
print(names)

dt = DecisionTreeClassifier()
model = dt.fit(X, y)
v = dtreeviz(model, X, y,
             target_name='trend',
             feature_names=names,
             class_names=['decrease', 'stable', 'increase'])
v.save('dt.svg')
print(f1_score(y, dt.predict(X), average='weighted'))
#正答率を求める pre1 = clf_result.predict(X_train) ac_score1 = metrics.accuracy_score(y_train, pre1) print("トレーニングデータ正答率 = ", ac_score1) pre2 = clf_result.predict(X_test) ac_score2 = metrics.accuracy_score(y_test, pre2) print("テストデータ正答率 = ", ac_score2) #重要度の可視化 features = [] for s in range(len(X.columns)): features.append(X.columns[s]) n_features = X.shape[1] plt.barh(range(n_features), clf_result.feature_importances_, align="center") plt.yticks(np.arange(n_features), features) plt.xlabel("importance") plt.ylabel("Feature value") plt.savefig("barh.png") #決定木の可視化 viz = dtreeviz(clf_result, X.values, y.values, target_name="variety", feature_names=features, class_names=["0", "1", "2", "3"]) viz.view()
# DOT data dot_data = tree.export_graphviz(clf, out_file=None, feature_names=iris.feature_names, class_names=iris.target_names, filled=True) # Draw graph graph = graphviz.Source(dot_data, format="png") graph #graph.render("decision_tree_graphivz") # Plot Decision Tree with dtreeviz Package from dtreeviz.trees import dtreeviz # remember to load the package viz = dtreeviz(clf, X, y, target_name="target", feature_names=iris.feature_names, class_names=list(iris.target_names)) viz ################## REGRESSION TASK ####################### # Visualizing the Decision Tree in Regression Task from sklearn import datasets from sklearn.tree import DecisionTreeRegressor from sklearn import tree # Prepare the data data boston = datasets.load_boston() X = boston.data