def applyDecisionTree(trainData, trainTargets, testData, testTargets, featureNames):
    """Train a decision tree, export it as Graphviz dot, and report test performance.

    Fits a ``DecisionTreeClassifier`` on the training data, writes the learned
    tree (truncated to depth 5) to ``DecisionTree.dot``, classifies
    ``testData`` and hands the predictions to ``showPerformance``.

    Args:
        trainData: Training feature vectors passed to ``fit``.
        trainTargets: Training labels passed to ``fit``.
        testData: Feature vectors to classify (one row per sample).
        testTargets: Labels passed to ``showPerformance`` with the predictions.
        featureNames: Feature names used to label the exported tree nodes.
    """
    model = DecisionTreeClassifier().fit(trainData, trainTargets)

    print("Feature names:", featureNames)
    export_graphviz(model, out_file="DecisionTree.dot",
                    feature_names=featureNames, max_depth=5)

    # Classify the whole test set in one call: predict() expects a 2-D batch,
    # and calling it once per 1-D sample is rejected by newer scikit-learn.
    if len(testData):
        classification = list(model.predict(testData))
    else:
        classification = []

    print("\nUsing a Decision Tree:")
    showPerformance(testTargets, classification)
def export_tree(forest):
    """Write every tree of a fitted ensemble to ``trees/tree_<index>.dot``.

    Args:
        forest: Fitted ensemble exposing ``estimators_``.
    """
    for index, estimator in enumerate(forest.estimators_):
        dot_path = 'trees/tree_' + str(index) + '.dot'
        with open(dot_path, 'w') as dot_file:
            # Every heat-map feature name except the last entry.
            names = utils.heatmap_feature_names[:-1]
            tree.export_graphviz(estimator, out_file=dot_file, feature_names=names)
def visualize_tree(clf, outname, headers):
    """Render a fitted tree to a PDF file via Graphviz.

    Args:
        clf: Fitted tree estimator passed to ``tree.export_graphviz``.
        outname: Destination path of the PDF.
        headers: Iterable of feature names; converted to a list for the export.
    """
    from sklearn.externals.six import StringIO
    import pydot

    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer, feature_names=list(headers))
    # Re-encode the dot source from Latin-1 to UTF-8 before parsing it.
    dot_source = buffer.getvalue().decode('latin1').encode('utf8')
    pydot.graph_from_dot_data(dot_source).write_pdf(outname)
def create_tree(args):
    """Fit an entropy-based decision tree on a CSV file and optionally render it.

    Args:
        args: Mapping of options with the keys
            ``<input>`` (CSV path), ``<class>`` (target column name),
            ``--verbose`` (truthy for progress output),
            ``--output`` (base name for the ``.dot``/``.png`` files, optional),
            ``--except`` (comma-separated columns to exclude, optional) and
            ``--samples`` (value for ``min_samples_split``).

    Exits the process with status 1 when the class column is missing or when
    the Graphviz ``dot`` command cannot be run successfully.
    """
    df = pd.read_csv(args['<input>'])
    class_attr = args['<class>']

    if class_attr not in df.columns:
        print('Class attribute "{}" not in dataset!'.format(class_attr))
        sys.exit(1)

    verbose = bool(args['--verbose'])
    output = args['--output'] or None
    exceptions = args['--except'].split(',') if args['--except'] else None
    samples = int(args['--samples'])

    # Features are the numeric columns minus the class column and exclusions.
    features = df._get_numeric_data().columns.difference([class_attr])
    if exceptions:
        if verbose:
            print('Removing the following: {}'.format(', '.join(exceptions)))
        features = features.difference(exceptions)

    if verbose:
        print('Using the following features: {}'.format(
            ', '.join(map(str, features))))

    dt = DecisionTreeClassifier(
        min_samples_split=samples,
        criterion='entropy',
        splitter='best',
        random_state=99)
    dt.fit(df[features], df[class_attr])

    if not output:
        return

    import subprocess

    if verbose:
        print('Saving as {0}.dot and {0}.png'.format(output))

    with open(output + '.dot', 'w') as f:
        export_graphviz(dt, out_file=f, feature_names=features)

    # Build the argument list directly so paths containing spaces survive.
    command = ['dot', '-Tpng', output + '.dot', '-o', output + '.png']
    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        # OSError: ``dot`` not found; CalledProcessError: ``dot`` failed.
        print('Problem creating graphviz image, is graphviz installed?')
        sys.exit(1)
def main():
    """Train an entropy-based decision tree on Iris, print metrics, export the tree.

    Prints the dataset size, the test accuracy and the confusion matrix, then
    writes the fitted tree to ``./data/tree.dot``.
    """
    # Pre-processing: load the Iris data set.
    from sklearn.datasets import load_iris
    iris = load_iris()
    print(len(iris['data']))

    # ``sklearn.cross_validation`` was removed from scikit-learn; prefer
    # ``model_selection`` and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # Hold out 20% of the samples (150 * 0.2 = 30) with a fixed random seed.
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify: accuracy and confusion matrix (rows are the true classes,
    # columns the predicted classes).
    from sklearn import metrics
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))

    with open('./data/tree.dot', 'w') as fw:
        tree.export_graphviz(clf, out_file=fw)
def drawDecisionTree(dt, filename, featureNames, classNames):
    """Print the feature/class names and render the tree ``dt`` to a PNG file.

    Args:
        dt: Fitted tree estimator passed to ``tree.export_graphviz``.
        filename: Destination path of the PNG.
        featureNames: Names used to label the split features.
        classNames: Names used to label the classes.
    """
    buffer = StringIO()
    print(featureNames)
    print(classNames)
    tree.export_graphviz(
        dt,
        out_file=buffer,
        feature_names=featureNames,
        class_names=classNames,
        rounded=True,
        special_characters=True,
        filled=True,
    )
    pydot.graph_from_dot_data(buffer.getvalue()).write_png(filename)
def create_tree(X, Y):
    """Fit an entropy decision tree on ``X``/``Y`` and save it as ``Tree.pdf``.

    Args:
        X: Training samples passed to ``fit``.
        Y: Training targets; one "Ad #<n>" class label is generated per entry.

    Returns:
        The fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, Y)

    from IPython.display import Image
    import pydotplus

    feature_names = ["Gender", "0-5", "6-12", "13-19", "20-27", "28-35", "36-50", "55+"]
    target_names = ["Ad #" + str(number) for number in range(1, len(Y) + 1)]

    buffer = StringIO()
    export_options = dict(
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    tree.export_graphviz(clf, out_file=buffer, **export_options)
    pydotplus.graph_from_dot_data(buffer.getvalue()).write_pdf("Tree.pdf")
    return clf
def main():
    """Fit a tree on simulated game data, export it, and print sanity predictions.

    Uses the first two fields of each ``run_game()`` record as features
    (labelled ``coin`` and ``bet``) and the third as the profit target, writes
    the tree to ``tree.dot`` and prints predictions with their accuracy for
    three fixed inputs.
    """
    data = run_game()
    clf = DecisionTreeClassifier(criterion='entropy')
    game_data = [[record[0], record[1]] for record in data]
    profits = [record[2] for record in data]
    clf.fit(game_data, profits)

    with open('tree.dot', 'w') as dotfile:
        export_graphviz(clf, dotfile, feature_names=['coin', 'bet'])

    # predict() expects a 2-D batch, so each single sample is wrapped in a list.
    predictions_lose1 = [clf.predict([[0, 0]]) for _ in range(100)]
    predictions_lose2 = [clf.predict([[0, 1]]) for _ in range(100)]
    predictions_win = [clf.predict([[1, 1]]) for _ in range(100)]

    print('All these profit predictions should be zero:')
    print(predictions_lose1)
    print('Accuracy was %s' % (calculate_accuracy(predictions_lose1, np.array([0])),))
    print('All these profit predictions should be zero:')
    print(predictions_lose2)
    print('Accuracy was %s' % (calculate_accuracy(predictions_lose2, np.array([0])),))
    print('All these profit predictions should be two:')
    print(predictions_win)
    print('Accuracy was %s' % (calculate_accuracy(predictions_win, np.array([2])),))
def tree(labels,X,df,i):
    """Fit a depth-4 tree, export it to ``<OUTPUT_DIRECTORY><i>_tree.dot``.

    Args:
        labels: Training targets.
        X: Training samples.
        df: Object whose ``columns`` provide the feature names for the export.
        i: Value used as prefix of the output file name.

    Returns:
        The fitted model's ``feature_importances_``.
    """
    model = DT(max_depth = 4)
    model.fit(X,labels)
    importances = model.feature_importances_
    params = model.get_params()
    dot_path = OUTPUT_DIRECTORY+str(i)+"_tree.dot"
    export_graphviz(model, out_file = dot_path, feature_names = df.columns)
    return importances
def draw_tree(clf):
    """Render the fitted tree ``clf`` to ``tree.pdf`` via Graphviz."""
    import pydot
    import StringIO

    buffer = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf('tree.pdf')
def visualize_tree(dt, fv_columns, exclude_attrs, create_file=True):
    """Export a decision tree to ``dt_.dot`` and render ``dt_.png`` with Graphviz.

    Args:
        dt: Either a ``DTMatcher`` (its ``clf`` attribute is exported) or a
            tree estimator.
        fv_columns: Feature-vector column labels. Must support boolean-mask
            indexing when ``exclude_attrs`` is given.
        exclude_attrs: Column names to leave out of the feature names, or
            ``None`` to use all of ``fv_columns``.
        create_file: Not used by this function; kept for interface
            compatibility.

    Returns:
        None. If ``dot`` cannot be run the failure is logged and the function
        returns early.
    """
    fitted_tree = dt.clf if isinstance(dt, DTMatcher) else dt

    if exclude_attrs is None:
        feature_names = fv_columns
    else:
        keep_mask = [column not in exclude_attrs for column in fv_columns]
        feature_names = fv_columns[keep_mask]

    with open("dt_.dot", 'w') as f:
        export_graphviz(fitted_tree, out_file=f, feature_names=feature_names)

    command = ["dot", "-Tpng", "dt_.dot", "-o", "dt_.png"]
    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        # OSError: ``dot`` is not installed; CalledProcessError: ``dot`` failed.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        logger.error("Could not run dot, ie graphviz, to "
                     "produce visualization")
        return

    print("Execute the following command in IPython command prompt:")
    print("")
    print("from IPython.display import Image")
    print("Image(filename='dt_.png') ")
def train_network(self):
    """Fit the decision tree on a training batch and export it to ``hitter_tree.pdf``.

    Splits the pregame hitter entries 80/20 into training and evaluation data,
    fits ``self._decision_tree`` on a stochastic training batch, writes the
    tree as PDF, then collects input vectors and actual DraftKings points for
    evaluation entries that have a postgame record. The database session is
    closed at the end.
    """
    session = self._database_session
    pregame_query = session.query(PregameHitterGameEntry)
    mlb_training_data, mlb_evaluation_data = self.get_train_eval_data(pregame_query, 0.8)
    X_train, Y_train = self.get_stochastic_batch(mlb_training_data, self.SIZE_TRAINING_BATCH)
    self._decision_tree.fit(X_train, Y_train)

    buffer = StringIO()
    tree.export_graphviz(self._decision_tree,
                         out_file=buffer,
                         feature_names=PregameHitterGameEntry.get_input_vector_labels())
    pydotplus.graph_from_dot_data(buffer.getvalue()).write_pdf("hitter_tree.pdf")

    x_test_actual = []
    y_test_actual = []
    for data in mlb_evaluation_data:
        try:
            postgame_query = self._database_session.query(PostgameHitterGameEntry).filter(
                PostgameHitterGameEntry.rotowire_id == data.rotowire_id,
                PostgameHitterGameEntry.game_date == data.game_date)
            postgame_entry = postgame_query.one()
            y_test_actual.append([postgame_entry.actual_draftkings_points])
            x_test_actual.append(data.to_input_vector())
        except NoResultFound:
            # Entries without postgame statistics are skipped.
            print("Ignoring hitter %s since his postgame stats were not found." % data.rotowire_id)

    self._database_session.close()
def learn_dtree(data, csvfile):
    """Summarise coverage per operator, plot it, and learn a depth-4 decision tree.

    Writes per-operator statistics of ``isCovered`` (mean, count, sum) to
    ``csvfile``, shows a scatter plot of them, then fits an entropy-based tree
    on the factorised ``operator`` and ``class`` columns plus ``testId``.

    Args:
        data: DataFrame with the columns ``operator``, ``method``, ``class``,
            ``testId`` and ``isCovered``. The helper columns ``op``, ``m`` and
            ``c`` are added to it in place.
        csvfile: Path or buffer that receives the per-operator statistics.

    Returns:
        The Graphviz dot source of the fitted tree as a string.
    """
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)

    grouped = data.groupby(['operator'])
    # Aggregate with a list and rename afterwards: this yields the intended
    # column names on every pandas version.
    stats = grouped['isCovered'].agg([np.mean, len, np.sum])
    stats.columns = ['mean_kill', 'number of mutants', 'number of killed']
    stats.to_csv(csvfile)

    plt.figure()
    # The original indexed 'mean' and 'sum', which the renamed columns above
    # do not provide; use the columns produced by those same aggregations.
    # NOTE(review): the y label says 'mutant_size' but the plotted value is
    # the sum of isCovered — confirm which column was intended.
    plt.scatter(standardize(stats['mean_kill']), stats['number of killed'])
    plt.ylabel('mutant_size')
    plt.xlabel('expected_kill (standatdize)')
    plt.show()

    data['op'] = pd.factorize(data['operator'])[0]
    data['m'] = pd.factorize(data['method'])[0]
    # The original assigned to an undefined ``HLdata``; the 'c' column is read
    # from ``data`` just below, so it must be stored there.
    data['c'] = pd.factorize(data['class'])[0]
    plt.close()

    x = data[['op', 'c', 'testId']].values
    y = data['isCovered'].values
    clf.fit(x, y)

    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    return dot_data.getvalue()
def visualize_tree(dtree):
    """Display the fitted tree ``dtree`` inline as a PNG image."""
    buffer = StringIO()
    export_options = dict(filled=True, rounded=True, special_characters=True)
    tree.export_graphviz(dtree, out_file=buffer, **export_options)
    png_bytes = pydot.graph_from_dot_data(buffer.getvalue()).create_png()
    display(Image(png_bytes))
def generate_plot(clf):
    """Render the fitted tree ``clf`` to ``weather_forecast.pdf``."""
    print("\nGenerating plot...")
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf("weather_forecast.pdf")
    print("Plot generated!")
def tree2():
    """Build a decision tree from the posted form fields and return an HTML page.

    On POST, the form value ``DV`` names the target column of the global
    ``df``; every other posted value except "Build Tree" is used as a feature
    column. The resulting page (also stored in the global ``final_html``)
    contains the dot source of the tree and the first 15 rows of ``df``.
    Any other request method returns the string 'helloo'.
    """
    global final_html
    global df,origin_df

    if request.method != 'POST':
        return 'helloo'

    init_style_string = """<p style="position: absolute; font-size: 12px; top: <top>px; width: <width>px; height: <height>px; left:<left>px; text-align: center;">tree_text_here</p>"""
    posted_values = list(MultiDict(request.form).values())
    DV_tree = MultiDict(request.form).get('DV')

    # Feature columns: every posted value except the submit button and the target.
    chi_key = [value for value in posted_values
               if value != "Build Tree" and value != DV_tree]
    # NOTE(review): falls back to the full frame when no feature was posted,
    # assuming the column selection sat inside the original loop's `if` — confirm.
    feature_frame = df.loc[:,chi_key] if chi_key else df

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(feature_frame.values, df[DV_tree].values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    # The return value is not used for the page below; the call is kept as in
    # the original.
    build_tree_html(k, init_style_string, 600, 150, 50, 309)

    temp_df = df[0:15]
    t = """</div><div style="float:right;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "</div><br><br><br>" + temp_df.to_html()
    return final_html
def set_yaml_infos(self, data):
    """Build a training set from the YAML object descriptions and fit ``self.clf``.

    For every object in ``data.objects`` the surrounding cuboid is computed
    and stored with the object's color in ``self.cubeized_objects``;
    randomized variants are generated and ``self.max_height`` is updated.
    For tasks whose name contains "task 3", 1000 random obstacles are added.
    Output verbosity is controlled by ``self.logging``.

    Args:
        data: Message exposing ``objects`` and ``task_name``.
    """
    verbosity = self.logging
    if verbosity >= 3:
        print("data: %s"%data)
    if verbosity >= 2:
        print("data.task_name: %s"%data.task_name)

    objects = data.objects
    task_name = data.task_name
    self.original_objects = objects
    all_data = {}

    if self.logging >= 1:
        print(">>>> Receiving Objects from YAML")
    if self.logging >= 2:
        print("Objects Received from YAML: \r\n %s"%objects)

    # NOTE(review): assumes everything up to the max_height update belongs to
    # the loop body — confirm against the original indentation.
    for entry in objects:
        name = entry.name
        color = entry.color
        primitive = entry.primitives
        cub_res = self.get_surrounding_cuboid(entry)
        self.cubeized_objects[name] = {'color': color, 'dimensions': cub_res}
        all_data[name] = self.randomize_objects(
            self.cubeized_objects, self.number_of_data,
            self.size_tresh_perc, self.color_tresh)
        self.max_height = max(self.max_height, max(cub_res))

    samples, labels = self.convert_to_dataset(all_data)
    if "task 3" in task_name:
        rnd_data, rnd_labels = self.create_random_obstacles(number=1000)
    else:
        rnd_data, rnd_labels = [],[]

    if self.logging >= 3:
        print("All Data + Labels \r\n %s \r\n %s" %( (labels + rnd_labels), (samples+rnd_data)))

    self.clf.fit(samples+rnd_data, labels+rnd_labels)

    # NOTE(review): assumes the final print is guarded by the same logging
    # level as the export — confirm.
    if self.logging >=3:
        tree.export_graphviz(self.clf, out_file='tree.dot', feature_names=['h', 's', 'v', 'site'])
        print("max height: %s\n"%self.max_height)
def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """Classify data using a CART decision tree.

    Args:
        Xtr: Training samples.
        ytr: Training labels.
        Xte: Test samples.
        yte: Ground-truth labels of the test samples.
        splitCriterion: Split criterion passed to the classifier.
        maxDepth: Maximum tree depth; ``0`` (the default), a negative value or
            ``None`` means unlimited depth.
        visualizeTree: When true, the learned tree is saved as
            ``tree_<timestamp>.pdf``.

    Returns:
        Tuple ``(accuracyRate, timing, probabilities, predicted)``. When an
        error occurs it is logged and the values computed so far are returned
        (initially ``0.0, 0.0, [], []``).
    """
    # Initialise every returned name up front so the final ``return`` cannot
    # fail when an exception interrupts the try block.
    accuracyRate, probabilities, timing, predicted = 0.0, [], 0.0, []
    try:
        # scikit-learn rejects max_depth=0; treat non-positive depths as "no limit".
        depth = maxDepth if (maxDepth and maxDepth > 0) else None
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=depth)

        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, maxDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()

        # Compare the predictions with the ground truth.
        accuracyRate = round(metrics.accuracy_score(yte, predicted), 2)
        # Also keep the class probability estimates.
        probabilities.append(cartClassifier.predict_proba(Xte))
        timing = endTime - startTime

        if visualizeTree:
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            # Compute the name once so the log message matches the file
            # that is actually written.
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted
def classfyWithScipy(dataSet,labels,dataToClassfy):
    """Fit an entropy tree, save it as ``entropy.pdf`` and classify new data.

    Args:
        dataSet: Training samples.
        labels: Training labels.
        dataToClassfy: Samples to classify.

    Returns:
        The predictions for ``dataToClassfy``.
    """
    classifier = tree.DecisionTreeClassifier(criterion="entropy")
    clf = classifier.fit(dataSet,labels)

    buffer = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf("entropy.pdf")

    return clf.predict(dataToClassfy)
def build_tree_image(model,X):
    """Export ``model`` to ``tree.dot`` and render ``tree.png`` with Graphviz.

    Args:
        model: Fitted tree estimator.
        X: Object whose ``columns`` provide the feature names.
    """
    # The context manager closes the file even if the export raises.
    with open("tree.dot", 'w') as dotfile:
        export_graphviz(model, out_file = dotfile, feature_names = X.columns)
    system("dot -Tpng tree.dot -o tree.png")
def main():
    """Train a depth-6 decision tree on the given training set and save it as PNG.

    Expects the path of the training set as first command-line argument and
    writes the rendered tree to ``partition.png``.
    """
    if len(sys.argv) < 2:
        print("One Argument Required; Training Set")
        return

    X_train, Y_train = ParseTraining(sys.argv[1])
    clf = tree.DecisionTreeClassifier(max_depth=6).fit(X_train, Y_train)

    export_options = dict(
        feature_names=["partConf", "recAvg", "latency", "ReadRate", "homeconf"],
        class_names=["Partition", "No Partition"],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    buffer = StringIO()
    tree.export_graphviz(clf, out_file=buffer, **export_options)
    pydot.graph_from_dot_data(buffer.getvalue()).write_png("partition.png")
def mainTree():
    """Train and evaluate the marriage decision model and save its tree as PDF.

    Loads the male and female data files, attaches the matched female class
    to the male records, scales the data, prepends 17 PCA components, splits
    90/10 into train and test data, trains the model, writes the tree to
    ``marriage.pdf`` and prints the explained variance and the test rate.
    """
    header = ('id|gender|age|height|edu|salary|nation|car|house|body|face|hair|'
              'smoke|drink|child|parent|bmi|where0|where1|'
              'marriage0|marriage1|look0|look1|where2').split('|')

    MaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/transed_M.txt',
                           names=header, sep='|')
    FemaleData = pd.read_csv('/home/idanan/jiayuan/code/resources/cluster_female.txt',
                             names=header + ['class'], sep='|')

    matches = matchDict('/home/idanan/jiayuan/code/resources/lovers_ids.txt')
    FemaleData['id'] = FemaleData['id'].map(partial(match, matches=matches))
    newMaleData = concatData(MaleData, FemaleData[['id', 'class']])
    MaleArrays = scaleData(newMaleData, ['id', 'gender'])

    # PCA is fitted on every column except the last one.
    unlabeled = MaleArrays[:, :-1]
    pca = factors(unlabeled, 17)
    print('PCA explained variance: %s' % (sum(pca.explained_variance_ratio_),))
    MaleArrays = np.c_[pca.transform(MaleArrays[:, :-1]), MaleArrays]

    trainData, testData = departData(MaleArrays, 0.9)
    trainModel = decisionModel(trainData)

    buffer = StringIO()
    tree.export_graphviz(trainModel, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf(
        "/home/idanan/jiayuan/code/resources/marriage.pdf")

    rate = test(trainModel, testData)
    print('Decision Model true rate %s' % (rate,))
def drawDecTree(decTree, X, y, outdir, label=None, featNames=None):
    """Fit ``decTree`` on ``X``/``y`` and save it as ``<outdir>/<label>_graph.png``.

    Args:
        decTree: Unfitted or fitted tree estimator; it is (re)fitted here.
        X: Training samples.
        y: Training targets.
        outdir: Directory that receives the PNG file.
        label: Prefix of the file name. When omitted, a fresh ``randint(100)``
            value is drawn on every call. (A default of ``randint(100)`` in
            the signature is evaluated only once, so all default calls would
            overwrite the same file.)
        featNames: Optional feature names for the export.
    """
    if label is None:
        label = randint(100)

    decTree.fit(X, y)

    dot_data = StringIO.StringIO()
    tree.export_graphviz(decTree, out_file=dot_data, feature_names=featNames)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(outdir + "/" + str(label) + "_graph" + ".png")
def create_pdf(clf):
    """Save the decision tree graph of ``clf`` as ``NvD5.pdf``."""
    print('Drawing tree...')
    buffer = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf('NvD5.pdf')
def fit_decision_tree(train_X, train_y, test_X, test_y):
    """Fit a decision tree, print its evaluation and export it to ``tre.dot``.

    Prints the classification report followed by the accuracy, e.g.::

        Classification Report:
                     precision    recall  f1-score   support
                0.0       0.80      0.89      0.85      4932
                1.0       0.75      0.60      0.67      2676
        avg / total       0.78      0.79      0.78      7608
        Accuracy: 0.788512092534

    Args:
        train_X: Training samples.
        train_y: Training targets; its ``flat`` view is passed to ``fit``.
        test_X: Test samples.
        test_y: Test targets.
    """
    dtc = tree.DecisionTreeClassifier()
    dtc = dtc.fit(train_X, train_y.flat)
    pred_y = dtc.predict(test_X)

    print(classification_report(test_y, pred_y))
    print(accuracy_score(test_y, pred_y))

    # Only the dot file is created here. The context manager closes the file
    # even if the export fails. The unused ``sklearn.externals.six`` import
    # was dropped because that module no longer exists in newer scikit-learn.
    with open('tre.dot', 'w') as f:
        tree.export_graphviz(dtc, out_file=f)
def dt_graph(treeest, cv, scores, features, labels, featnames, outfile):
    """Refit the tree on the best cross-validation fold and save its graph as PDF.

    Args:
        treeest: Tree estimator to refit.
        cv: Iterable of ``(train_indices, test_indices)`` pairs.
        scores: Per-fold scores; the fold with the highest score is used.
        features: Indexable feature matrix.
        labels: Indexable label vector.
        featnames: Feature names for the export.
        outfile: Destination path of the PDF.
    """
    bestfold = np.argmax(scores)

    # Walk the folds until the best one is reached; only its training indices
    # are needed.
    for fold_number, (train, _) in enumerate(cv):
        if fold_number != bestfold:
            continue

        treeest.fit(features[train], labels[train])

        buffer = StringIO()
        tree.export_graphviz(treeest, out_file=buffer, feature_names=featnames)
        pydot.graph_from_dot_data(buffer.getvalue()).write_pdf(outfile)
        return

    print("You should never see this text from dt_graph!")
    return
def main(features_fpath, classes_fpath):
    """Rank features with an extra-trees ensemble and export one of its trees.

    Reads feature names from the last line of ``features_fpath`` containing
    ``#``, fits the ensemble on the scaled features (first column dropped),
    prints the cross-validation scores and the feature ranking, and writes the
    first tree of the ensemble to ``bala.dot``.

    Args:
        features_fpath: Path of the whitespace-separated feature file.
        classes_fpath: Path of the file holding one class value per sample.
    """
    names = None
    with open(features_fpath) as features_file:
        for line in features_file:
            if '#' in line:
                names = line.split()[1:]

    X = scale(np.genfromtxt(features_fpath)[:, 1:].copy())
    y = np.loadtxt(classes_fpath)

    if names is None:
        # No header line found: fall back to generic names rather than
        # failing with a NameError while printing the ranking.
        names = ["feature_%d" % column for column in range(X.shape[1])]

    forest = ExtraTreesClassifier(max_depth=4,
                                  criterion="entropy",
                                  compute_importances=True)

    scores = cross_val_score(forest, X, y, score_func=f1_score, cv=5)
    print(scores)

    forest.fit(X, y)

    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking, most important first.
    print("Feature ranking:")
    for rank, feature_index in enumerate(indices, 1):
        print("%d. feature %s (%f)" % (rank, names[feature_index],
                                       importances[feature_index]))

    # export_graphviz works on a single decision tree, not on the ensemble,
    # so export the first fitted tree.
    export_graphviz(forest.estimators_[0], 'bala.dot')
def make_tree_test():
    """Fit a coarse decision tree on the loaded data, save and display its graph.

    Returns:
        The fitted classifier.
    """
    from sklearn import tree
    import StringIO
    import pydot
    from IPython.display import display, Image

    x, y, dates, movies = load_data()
    test_x, train_x, test_y, train_y = create_test_train_set(x, y)

    fit = tree.DecisionTreeClassifier(min_samples_split=3000).fit(train_x, train_y)

    buffer = StringIO.StringIO()
    tree.export_graphviz(
        fit,
        feature_names=train_x.columns,
        class_names=["1", "2", "3", "4", "5"],
        out_file=buffer,
    )
    graph = pydot.graph_from_dot_data(buffer.getvalue())
    graph[0].write_png("tree_toy.png")
    display(Image(graph[0].create_png()))
    return fit
def arbolesRegresion(caract):
    """Cross-validate a regression tree on selected Boston features and plot importances.

    Runs a 10-fold cross-validation using only the feature columns listed in
    ``caract``, prints the mean absolute error, mean squared error and R^2
    averaged over the folds, shows a bar chart of the averaged relative
    feature importances and saves the last fitted tree as ``bostonTree.pdf``.

    Args:
        caract: Sequence of column indices of ``boston_X`` to use as features.
    """
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True)
    importancias = [0] * 13
    mae = mse = r2 = 0
    kf = KFold(len(boston_Y), n_folds=10, indices=True)

    for train_idx, test_idx in kf:
        trainX, testX = boston_X[train_idx], boston_X[test_idx]
        trainY, testY = boston_Y[train_idx], boston_Y[test_idx]

        # Copy the selected columns into fresh float matrices.
        nCar = len(caract)
        train = np.zeros((len(trainX), nCar))
        test = np.zeros((len(testX), nCar))
        for target_col in range(nCar):
            source_col = caract[target_col]
            train[:, target_col] = trainX[:, source_col]
            test[:, target_col] = testX[:, source_col]

        # The targets are passed as a column vector.
        trainYNuevo = np.reshape(trainY, (len(trainY), -1))
        clf.fit(train, trainYNuevo)
        prediccion = clf.predict(test)

        mae += metrics.mean_absolute_error(testY, prediccion)
        mse += metrics.mean_squared_error(testY, prediccion)
        r2 += metrics.r2_score(testY, prediccion)

        # Importances relative to the most important feature, in percent.
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]

    print('Error abs:  %s Error cuadratico:  %s R cuadrado:  %s'
          % (mae/len(kf), mse/len(kf), r2/len(kf)))

    # Average the accumulated importances over the 10 folds.
    for i in range(13):
        importancias[i] = importancias[i]/10

    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))

    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()

    import StringIO, pydot
    buffer = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=buffer)
    pydot.graph_from_dot_data(buffer.getvalue()).write_pdf("bostonTree.pdf")
def tree3():
    """Build a decision tree from the posted form fields and return an HTML page.

    On POST, the form value ``DV`` names the target column of the global
    ``df``; every other posted value except "Build Tree" is used as a feature
    column. The resulting page (also stored in the global ``final_html``)
    contains the dot source of the tree and the first 15 rows of ``df``.
    Any other request method returns the string 'helloo'.
    """
    global final_html
    global df,df_train,df_test,test_train_created,origin_df

    init_style_string = template.style_string
    if request.method != 'POST':
        return 'helloo'

    posted_values = list(MultiDict(request.form).values())
    DV_tree = MultiDict(request.form).get('DV')

    # Feature columns: every posted value except the submit button and the target.
    chi_key = [value for value in posted_values
               if value != "Build Tree" and value != DV_tree]
    # NOTE(review): falls back to the full frame when no feature was posted,
    # assuming the column selection sat inside the original loop's `if` — confirm.
    feature_frame = df.loc[:,chi_key] if chi_key else df

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(feature_frame.values, df[DV_tree].values)

    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    k = dot_data.getvalue()

    # The return value is not used for the page below; the call is kept as in
    # the original.
    build_tree_html(k, init_style_string, 600, 150, 50, 309)

    temp_df = df[0:15]
    t = """</div><div style="width:600px; height:700px; position: absolute; top: 20px; left:500px;"><br> Decision Tree result <br>"""
    final_html = template.s1 + t + k + "<br><br></div>" + temp_df.to_html()
    return final_html
#
# Create a decision-tree classifier with default parameters.
#
from sklearn import tree
dtc = tree.DecisionTreeClassifier()

#
# Train the classifier on the training data / labels and score it on the
# testing data / labels.
#
dtc.fit(X_train, y_train)
score = dtc.score(X_test, y_test)
print("High-Dimensionality Score:  %s" % round((score * 100), 3))

#
# Output a .DOT file and render it to PNG. Graphviz must be installed
# (`brew install graphviz`, or the msi installer from the graphviz website on
# Windows 10); otherwise http://webgraphviz.com/ or gvedit.exe can display the
# exported tree.dot directly.
#
# The fitted estimator itself is exported (not its internal ``tree_``
# object), which every scikit-learn version accepts.
tree.export_graphviz(dtc, out_file='tree.dot', feature_names=X.columns)

from subprocess import call
call(['dot', '-T', 'png', 'tree.dot', '-o', 'tree.png'])
# Report the grid-search outcome.
print(dt_grid_estimator.best_score_)
final_estimator = dt_grid_estimator.best_estimator_
results = dt_grid_estimator.cv_results_
for _result_key in ("mean_test_score", "mean_train_score", "params"):
    print(results.get(_result_key))

# The raw tree_ object holds the learned model but is not human-readable.
print(final_estimator.tree_)

# Export a readable version of the tree and save it as PDF.
dot_data = io.StringIO()
tree.export_graphviz(final_estimator,
                     out_file=dot_data,
                     feature_names=X_train.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
graph.write_pdf("C:/Users/Algorithmica/Downloads/tree.pdf")

# Read the test data and apply the imputers/encoders fitted earlier.
titanic_test = pd.read_csv(
    "C:\\Users\\Algorithmica\\Downloads\\titanic_test.csv")
print(titanic_test.info())

titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
titanic_test['Embarked'] = le_embarked.transform(titanic_test['Embarked'])
titanic_test['Sex'] = le_sex.transform(titanic_test['Sex'])
df2, targets, job_num = tar_encode(df, "y")

# Drop the categorical columns (and the original target column 'y').
df2.drop(
    ['y', 'job', 'marital', 'education', 'default', 'housing', 'loan',
     'contact', 'month', 'day_of_week', 'poutcome'],
    axis=1,
    inplace=True,
)

# Correlation matrix of the remaining columns.
print(df2.corr())

# Fit a depth-4 tree on the first 20 columns against the encoded target.
features = df2.columns[0:20].tolist()
y = df2["target"]
X = df2[features]
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X, y)

# Export the tree and render it to PNG.
tree.export_graphviz(dt,
                     out_file='C:/MIS680/tree.dot',
                     feature_names=X.columns)
[graph] = pydot.graph_from_dot_file('C:/MIS680/tree.dot')
graph.write_png('C:/MIS680/tree.png')
# Export the random-forest model parameters.
Estimators = clf.estimators_
Importances = clf.feature_importances_
numberClasses = clf.n_classes_
numberInputs = len(clf.feature_importances_)
numberTrees = len(clf.estimators_)

# Model header file: the context manager closes it even if a write fails.
with open("RandomForestModel.txt", "w") as fo:
    fo.write("RandomForestClassifier\n")
    fo.write("IrisRandomForestModel\n")
    fo.write("classification\n")
    fo.write("binarySplit\n")
    fo.write(str(numberInputs) + "\n")
    for num in range(0, numberInputs):
        fo.write(shuxingname[num] + ", double,continuous,NA,NA,asMissing\n")
        print(shuxingname[num])
    fo.write(str(numberClasses) + "\n")
    for num in range(0, numberClasses):
        fo.write(classname[num] + "\n")
        print(classname[num])
    fo.write(str(numberTrees) + "\n")

# One dot file per tree of the forest.
for num in range(0, numberTrees):
    fileName = "irsRF_" + str(num) + ".dot"
    with open(fileName, 'w') as f:
        # export_graphviz writes into ``f``; its return value is not needed
        # and must not replace the open file handle.
        tree.export_graphviz(Estimators[num], out_file=f)
# Extract the data column by column and build a dictionary keyed by label.
for each_label in lensesLabels:
    label_position = lensesLabels.index(each_label)
    for each in lenses:
        lenses_list.append(each[label_position])
    lenses_dict[each_label] = lenses_list
    lenses_list = []

# Build a pandas.DataFrame from the dictionary.
lenses_pd = pd.DataFrame(lenses_dict)

# Encode every column as integers with a LabelEncoder.
le = LabelEncoder()
for col in lenses_pd.columns:
    lenses_pd[col] = le.fit_transform(lenses_pd[col])

# Create the DecisionTreeClassifier and build the tree from the data.
clf = tree.DecisionTreeClassifier(max_depth=4)
clf = clf.fit(lenses_pd.values.tolist(), lenses_target)

# Draw the decision tree.
dot_data = StringIO()
tree.export_graphviz(
    clf,
    out_file=dot_data,
    feature_names=lenses_pd.keys(),
    class_names=clf.classes_,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Save the rendered decision tree as a PDF file.
graph.write_pdf("tree.pdf")

# Predict a single encoded sample.
print(clf.predict([[1, 1, 1, 0]]))
# -*- coding:utf-8 -*-
# Classification with the ID3 algorithm (entropy-based decision tree).
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz

data = pd.read_csv('../data/titanic_data.csv', encoding='utf-8')
# Drop the ID column; it is not suitable as a feature.
data.drop(['PassengerId'], axis=1, inplace=True)

# 'Sex' is a categorical label; encode it as 1 for male and 0 for female.
data.loc[data['Sex'] == 'male', 'Sex'] = 1
data.loc[data['Sex'] == 'female', 'Sex'] = 0

# Missing values are replaced by the (truncated) mean age.
data.fillna(int(data.Age.mean()), inplace=True)
print(data.head(5))  # inspect the data

# For ease of presentation the age (last column) is not used.
X = data.iloc[:, 1:3]
y = data.iloc[:, 0]

dtc = DTC(criterion='entropy')  # decision tree based on information entropy
dtc.fit(X, y)  # train the model
print('输出准确率:', dtc.score(X, y))

# Visualise the decision tree. The export is a dot file; Graphviz must be
# installed to convert it to .pdf or .png.
with open('../tmp/tree.dot', 'w') as f:
    # export_graphviz writes into ``f``; its return value is not needed and
    # must not replace the open file handle.
    export_graphviz(dtc, feature_names=X.columns, out_file=f)
# Hold out three samples (positions 0, 50 and 100) for testing.
test_idx = [0,50,100]

train_data = np.delete(iris.data, test_idx, axis=0)
train_target = np.delete(iris.target, test_idx)
print("Data", train_data)
print("Teste", test_idx)

test_data = iris.data[test_idx]
test_target = iris.target[test_idx]

clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)
print("Resposta:", clf.predict(test_data))

# Export the tree with feature/class names and render it as 'iris'.
export_options = dict(
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_data = tree.export_graphviz(clf, out_file=None, **export_options)
graph = graphviz.Source(dot_data)
graph.render('iris')
mpl.rc('ytick', labelsize=12)

#%% decision trees
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz

iris = load_iris()
X = iris.data[:, 2:]  # petal length and width
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X, y)

export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

#%% Plot decision boundaries
from matplotlib.colors import ListedColormap


def plot_decision_boundary(clf, X, y, axes=[0, 7.5, 0, 3], iris=True, legend=False, plot_training=True):
    """Draw the filled decision regions of ``clf`` on a 100x100 grid.

    Args:
        clf: Fitted classifier with two input features.
        X: Training samples (not used by the visible body).
        y: Training targets (not used by the visible body).
        axes: ``[x1_min, x1_max, x2_min, x2_max]`` limits of the grid.
        iris, legend, plot_training: Not used by the visible body.
            NOTE(review): the function may continue beyond this excerpt — confirm.
    """
    grid_x1, grid_x2 = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.c_[grid_x1.ravel(), grid_x2.ravel()]
    region_classes = clf.predict(grid_points).reshape(grid_x1.shape)
    region_colors = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(grid_x1, grid_x2, region_classes, alpha=0.3, cmap=region_colors)
features = ['Age', 'Experience', 'Rank', 'Nationality'] X = df[features] y = df['Go'] print(X) print(y) Now we can create the actual decision tree, fit it with our details, and save a .png file on the computer: Example Create a Decision Tree, save it as an image, and show the image: dtree = DecisionTreeClassifier() dtree = dtree.fit(X, y) data = tree.export_graphviz(dtree, out_file=None, feature_names=features) graph = pydotplus.graph_from_dot_data(data) graph.write_png('mydecisiontree.png') img=pltimg.imread('mydecisiontree.png') imgplot = plt.imshow(img) plt.show() Result Explained The decision tree uses your earlier decisions to calculate the odds of you wanting to go see a comedian or not. Let us read the different aspects of the decision tree: Rank
def ejecutarModeloyGuardarlo(nombreModelo, modelo, pathModelo, ds_train_f, ds_train_t, ds_test_f, ds_test_t,
                             feature_names, modeloEsGrid, modoDebug, dir_subgrupo_img):
    """Train a model with early stopping, plot train/test metric curves and pickle it.

    Args:
        nombreModelo: Short model name, used in log messages and file names.
        modelo: Estimator whose ``fit`` accepts ``eval_metric``,
            ``early_stopping_rounds``, ``eval_set`` and ``verbose`` and which
            exposes ``evals_result()`` afterwards (XGBoost-style API).
        pathModelo: Path of the pickle file to write.
        ds_train_f, ds_train_t: Training features and targets.
        ds_test_f, ds_test_t: Test features and targets.
        feature_names: Feature names, used for the importance report and the
            exported tree.
        modeloEsGrid: When true, ``modelo`` is treated as a grid search and its
            ``best_estimator_`` is what gets pickled.
        modoDebug: When true (and ``nombreModelo == "rf_grid"``), print feature
            importances and export one tree of the forest as a DOT file.
        dir_subgrupo_img: Directory prefix for the metric plot (concatenated
            as-is, so it must end with a path separator).

    Returns:
        The fitted ``modelo``.

    NOTE(review): ``evals_result()`` is called on ``modelo`` itself even when
    ``modeloEsGrid`` is true -- confirm the grid wrapper really exposes it.
    """
    print((datetime.datetime.now()).strftime("%Y%m%d_%H%M%S") + " Ejecutando " + nombreModelo + " ...")

    # early_stopping_rounds: number of iterations without improvement of the
    # train/test differential error after which training stops; limits both
    # the number of XGBoost iterations and overfitting.
    param_parada_iteraciones = 10
    eval_set = [(ds_train_f, ds_train_t), (ds_test_f, ds_test_t)]

    # -------- PLOT THE OVERFITTING ERROR --------
    # https://machinelearningmastery.com/avoid-overfitting-by-early-stopping-with-xgboost-in-python/
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    # "map": mean average precision. "aucpr" (area under the PR curve) gave
    # worse precision results.
    METODO_EVALUACION = "map"

    # TRAINING, with the extra parameters needed to observe overfitting.
    modelo = modelo.fit(ds_train_f, ds_train_t, eval_metric=[METODO_EVALUACION],
                        early_stopping_rounds=param_parada_iteraciones, eval_set=eval_set, verbose=False)

    # --------------- Draw the plot ---------------
    y_pred = modelo.predict(ds_test_f)
    y_pred = y_pred.astype(float)
    predictions = [round(value) for value in y_pred]
    precision_para_medir_overfitting = precision_score(ds_test_t, predictions)
    print("Accuracy (PRECISION) para medir el overfitting: %.2f%%" % (precision_para_medir_overfitting * 100.0))

    # 'validation_0' / 'validation_1' follow the order of eval_set: train, test.
    results = modelo.evals_result()
    epochs = len(results['validation_0'][METODO_EVALUACION])
    x_axis = range(0, epochs)
    fig, ax = pyplot.subplots()
    ax.plot(x_axis, results['validation_0'][METODO_EVALUACION], label='Train')
    ax.plot(x_axis, results['validation_1'][METODO_EVALUACION], label='Test')
    ax.legend()
    pyplot.xlabel("Numero de epochs")
    pyplot.ylabel(METODO_EVALUACION)
    pyplot.title("Modelo: " + nombreModelo + " - Metodo de evaluacion: " + METODO_EVALUACION)
    path_img_metricas_modelo_ovft = dir_subgrupo_img + nombreModelo + "_" + METODO_EVALUACION + ".png"
    print("Pintando IMG de metricas del modelo overfitting (train vs test). Path: " + path_img_metricas_modelo_ovft)
    plt.savefig(path_img_metricas_modelo_ovft, bbox_inches='tight')
    # Reset the figure state so later plots start clean.
    plt.clf()
    plt.cla()
    plt.close()

    if modeloEsGrid:
        # The context manager guarantees the pickle file is flushed and closed.
        with open(pathModelo, 'wb') as fichero_modelo:
            pickle.dump(modelo.best_estimator_, fichero_modelo)
        print("Modelo GRID tipo " + nombreModelo + " Los mejores parametros probados son: " + str(
            modelo.best_params_))

        if modoDebug and nombreModelo == "rf_grid":
            feature_imp = pd.Series(modelo.best_estimator_.feature_importances_, index=feature_names).sort_values(
                ascending=False)
            print("Importancia de las features en el modelo " + nombreModelo + " ha sido:")
            print(feature_imp.to_string())

            print("Generando dibujo de un árbol de decision (elegido al azar de los que haya)...")
            print(feature_names)
            print("Guardando dibujo DOT en: " + pathModelo + '.dot' + " Convertirlo ONLINE en: http://viz-js.com/")
            # NOTE(review): list('TARGET') expands to ['T', 'A', 'R', 'G', 'E', 'T'],
            # so classes are labelled with single letters -- confirm whether real
            # class names were intended.  The tree exported is always index 1,
            # not a random one as the log message says.
            export_graphviz(modelo.best_estimator_.estimators_[1], out_file=pathModelo + '.dot',
                            feature_names=feature_names, class_names=list('TARGET'), rounded=True,
                            proportion=False, precision=2, filled=True)
            # Online viewers:
            #   http://www.webgraphviz.com/
            #   http://sandbox.kidstrythisathome.com/erdos/
            #   http://viz-js.com/
            # Local DOT -> PNG conversion (Graphviz could not be installed on the author's PC):
            # call(['dot', '-Tpng', pathModelo + '.dot', '-o', pathModelo + '.png', '-Gdpi=600'])
    else:
        with open(pathModelo, 'wb') as fichero_modelo:
            pickle.dump(modelo, fichero_modelo)

    return modelo
import numpy as np

# Share of passing students in the whole data set.
print("Passing: %d out %d (%.2f%%)" % (np.sum(cianjur_pass), len(cianjur_pass),
                                        100 * float(np.sum(cianjur_pass)) / len(cianjur_pass)))

#%% 5.fit a decision tree
from sklearn import tree

bogor = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
bogor = bogor.fit(cianjur_train_att, cianjur_train_pass)

#%% 6.visualize tree
import graphviz

# out_file=None returns the DOT source as a string for inline rendering.
yogyakarta = tree.export_graphviz(bogor, out_file=None, label="all",
                                  impurity=False, proportion=True,
                                  feature_names=list(cianjur_train_att),
                                  class_names=["fail", "pass"],
                                  filled=True, rounded=True)
malang = graphviz.Source(yogyakarta)
malang

#%% 7.save tree
# export_graphviz needs the fitted estimator (bogor); the DOT string
# (yogyakarta) produced above is not a valid first argument.
# NOTE(review): the original statement was cut off after class_names; any
# further keyword arguments (e.g. styling flags) are not visible here.
tree.export_graphviz(bogor, out_file="student-performance.dot", label="all",
                     impurity=False, proportion=True,
                     feature_names=list(cianjur_train_att),
                     class_names=["fail", "pass"],
                     )
overfit_values[f][d], ax=axs[i // 4, i % 4], title='Overfitting for max_depth = %d with %s criteria' % (d, f), xlabel='min_impurity_decrease', ylabel='accuracy', percentage=True) i += 1 i += 1 plt.suptitle('QOT Overfitting - Decision Trees') plt.savefig(subDir + 'QOT Overfitting - Decision Trees') dot_data = export_graphviz(best_tree, out_file=(subDir + 'QOT - ' + key + ' - dtree.dot'), filled=True, rounded=True, special_characters=True, class_names=['negative', 'positive']) # Convert to png call([ 'dot', '-Tpng', (subDir + 'QOT - ' + key + ' - dtree.dot'), '-o', (subDir + 'QOT Decision Trees - ' + key + ' - tree representation.png'), '-Gdpi=600' ]) prd_trn = best_tree.predict(trnX) prd_tst = best_tree.predict(tstX) ds.plot_evaluation_results(["negative", "positive"], trnY, prd_trn, tstY, prd_tst) plt.suptitle('QOT Decision Trees - ' + key +
def runDecisionTreeI(plotTree=False, trainSize=0.3, pruneTree=False, pruningThreshold=0, maxDepth=50):
    """Train, optionally prune, evaluate and optionally plot a decision tree for dataset I.

    The data set is read from the CSV file named by ``sys.argv[1]``; column
    names come from ``pcap-features-all.txt``.  Results are printed and
    appended to the module-level lists ``train_sizes``, ``total_nodes``,
    ``accuracy_1_test`` and ``accuracy_1_train``.

    Args:
        plotTree: When true, render the tree with graphviz to ``test-v2``.
        trainSize: Fraction of the data used for training; the rest is the
            test set.
        pruneTree: When true, collapse nodes whose smallest per-class value is
            below ``pruningThreshold`` into leaves.
        pruningThreshold: Threshold used by the pruning step.
        maxDepth: ``max_depth`` of the ``DecisionTreeClassifier``.

    NOTE(review): the nesting of the prune-count step was reconstructed from
    flattened source.  ``count_prunes`` is now always defined and only becomes
    non-zero when pruning was actually requested, so the final
    ``total_nodes.append`` cannot fail or subtract nodes that were never pruned.
    """
    global train_sizes, accuracy_1_test, accuracy_1_train, total_nodes, mean_error_1

    ## CATEGORICAL COLUMNS (positional indices) THAT NEED LABEL ENCODING
    cols_to_drop = [1, 2, 3]

    ## PRUNING FUNCTION
    def prune_index(inner_tree, index, threshold):
        if inner_tree.value[index].min() < threshold:
            # turn node into a leaf by "unlinking" its children
            inner_tree.children_left[index] = TREE_LEAF
            inner_tree.children_right[index] = TREE_LEAF
        # if there are children, visit them as well
        if inner_tree.children_left[index] != TREE_LEAF:
            prune_index(inner_tree, inner_tree.children_left[index], threshold)
            prune_index(inner_tree, inner_tree.children_right[index], threshold)

    ## LOAD FEATURE NAMES FROM FILE (context manager closes the handle)
    with open('../datasets/network-intrusions/pcap-features-all.txt', "r") as features_file:
        all_features = [line.rstrip() for line in features_file]
    train_features = all_features[0:39]

    ## LOAD DATASET INTO DATAFRAME
    df = pd.read_csv(sys.argv[1], header=None)
    df.columns = all_features

    ## CHANGE CATEGORICAL DATA TO INTEGER TYPE LABELS USING LabelEncoder
    le = preprocessing.LabelEncoder()
    for col in cols_to_drop:
        le.fit(df.iloc[:, col])
        df.iloc[:, col] = le.transform(df.iloc[:, col])

    ## FEATURES AND LABEL SELECTION
    X = df.iloc[:, 0:39].values
    y = df.iloc[:, 42].values

    ## TRAIN/TEST SPLIT
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=(1 - trainSize),
                                                        train_size=trainSize,
                                                        shuffle=True)

    ## FIT DATA TO MODEL
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=maxDepth)
    clf = clf.fit(X_train, y_train)

    ## PRUNE NODES WHOSE MINIMUM CLASS VALUE IS BELOW THE THRESHOLD
    count_prunes = 0
    if pruneTree:
        prune_index(clf.tree_, 0, pruningThreshold)
        ## COUNT NODES AFTER PRUNING
        for node in range(len(clf.tree_.value)):
            if clf.tree_.value[node].min() < pruningThreshold:
                count_prunes = count_prunes + 1
        print(count_prunes)

    ## PREDICT THE VALUES OF THE TESTING SET
    predictions = clf.predict(X_test)

    ## COUNT MISPREDICTIONS
    count = sum(1 for expected, predicted in zip(y_test, predictions) if expected != predicted)

    # Score once and reuse for both the report and the global result lists.
    test_accuracy = clf.score(X_test, y_test)
    train_accuracy = clf.score(X_train, y_train)
    node_count = len(clf.tree_.value)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('RESULTS FOR DATASET I')
    print('----------------------------------------')
    print('Test Size:')
    print(len(X_test))
    print('Train Size:')
    print(len(X_train))
    print('Accuracy on Test Data:')
    print(test_accuracy)
    print('Accuracy on Train Data:')
    print(train_accuracy)
    print('Mis-Classified:')
    print(str(count) + ' Out of ' + str(len(y_test)))
    print('Number of Nodes:')
    print(node_count)
    if pruneTree:
        print('Number of Nodes After Pruning:')
        print(node_count - count_prunes)
    print('----------------------------------------')

    ## DRAW A DECISION TREE GRAPH
    if plotTree:
        dot_data = tree.export_graphviz(clf,
                                        feature_names=train_features,
                                        class_names=['normal', 'attack'],
                                        filled=True,
                                        rounded=True,
                                        out_file=None)
        graph = graphviz.Source(dot_data)
        graph.render('test-v2')

    ## APPEND RESULTS TO GLOBALS
    train_sizes.append(trainSize)
    total_nodes.append(node_count - count_prunes)
    accuracy_1_test.append(test_accuracy)
    accuracy_1_train.append(train_accuracy)
# Plot error rate against tree depth to illustrate overfitting.
plt.figure(facecolor='w')
plt.plot(depth, err_list, 'ro-', lw=3)
plt.xlabel('决策树深度', fontsize=16)
plt.ylabel('错误率', fontsize=16)
plt.grid(True)
plt.title('决策树深度太多,导致的过拟合问题', fontsize=18)
plt.show()

from skimage import io
from IPython.display import Image
import pydotplus

# Labelled rendering: out_file=None returns the DOT source as a string.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=['PCA1', 'PCA2'],
    class_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
    filled=True,
    rounded=True,
    special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

# Write the plain DOT file.  export_graphviz returns None when given a file
# object, so its result must not be assigned back to the open handle.
with open('iris.dot', 'w') as f:
    tree.export_graphviz(model, out_file=f)

import pydotplus

# Unlabelled rendering saved as PDF.
dot_data = tree.export_graphviz(model, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('iris3.pdf')
def run_prediction(df, input_features=None, clf=None, num_iters=10):
    """Repeatedly train and evaluate a classifier for every facet in ``PREDICTIONS``.

    For each facet and each of ``num_iters`` repetitions the sessions of ``df``
    are split into train/test sets with ``train_test_split_list`` (0.8 ratio),
    a model is fitted on the standardized training features, and predictions
    for all processed users are pooled to compute one accuracy value.

    Args:
        df: DataFrame with a ``session_num`` column, one column per facet and
            the feature columns; an optional ``userID`` column enables per-user
            runs when the module-level ``PERSONALIZED`` flag is set.
        input_features: List of feature column names.  ``None``, ``['dummy1']``
            or ``['dummy2']`` select every column not in
            ``PREDICTIONS + METADATA``; the two dummy values additionally
            select a ``DummyClassifier`` baseline (stratified / most frequent).
        clf: Key into the classifier map ('knn', 'gnb', 'svm', 'rfc', 'ada',
            'qda', 'svc', 'mlp', 'ovr', 'ovo', 'gsc', 'dct').  Unknown keys or
            ``None`` fall back to Gaussian naive Bayes.
        num_iters: Number of repetitions per facet.

    Returns:
        A ``pandas.DataFrame`` with one row per (facet, iteration) holding
        ``prediction``, ``accuracy``, ``y_true``, ``y_pred`` and ``run_ct``.

    Side effects: increments the globals ``RUN_CT`` and ``DCT_COUNT`` and, for
    ``clf == 'dct'``, writes a DOT file per fitted tree to a hard-coded path.

    NOTE(review): block nesting was reconstructed from flattened source; in
    particular the result-collection step is assumed to run once per iteration
    (after the user loop) -- confirm against the original file.
    """
    global FEATURES
    global RUN_CT
    global PREDICTIONS
    global METADATA
    global DCT_COUNT

    features = None
    print(input_features)
    if input_features is None or input_features == [
            'dummy1'
    ] or input_features == ['dummy2']:
        features = [
            c for c in list(df.columns.values)
            if not c in PREDICTIONS + METADATA
        ]
    else:
        features = input_features

    le = preprocessing.LabelEncoder()
    scaler = StandardScaler()
    results = []
    df_copy = df.copy()

    userIDs = [1]
    if 'userID' in df_copy.columns.values:
        userIDs = set(df_copy['userID'].tolist())
    if not PERSONALIZED:
        # Non-personalized runs use a single placeholder "user".
        userIDs = [1]

    for facet in PREDICTIONS:
        for i in range(num_iters):
            print(DCT_COUNT, facet)
            RUN_CT += 1
            res = {'prediction': facet}
            # 0) Slice to only feature columns and test column
            # 0a) Convert data to np array
            # 1) Random permutation
            y_tests_all = []
            y_preds_all = []
            y_scores_all = []
            for u in userIDs:
                data_okay = True
                res = {'prediction': facet}
                # NOTE(review): input_features is a list (or None), so this
                # membership test against two strings is always true; it looks
                # like [['dummy1'], ['dummy2']] was intended -- confirm.
                if input_features not in [
                        'dummy1', 'dummy2'
                ] and facet != 'userID' and PERSONALIZED:
                    df = df_copy[df_copy['userID'] == u]

                # Retry the random session split until fitting succeeds.
                # NOTE(review): a ValueError that recurs on every split makes
                # this loop spin forever.
                while True:
                    try:
                        (session_nums_train,
                         session_nums_test) = train_test_split_list(
                             list(set(df['session_num'].tolist())), 0.8)
                        # .values replaces DataFrame/Series.as_matrix(), which
                        # was removed in pandas 1.0.
                        y = df[facet].values
                        le.fit(y)
                        df_train = df[df['session_num'].isin(
                            session_nums_train)]
                        df_test = df[df['session_num'].isin(session_nums_test)]
                        X_train = df_train[features].values
                        X_test = df_test[features].values
                        assert (len(df_train) + len(df_test)) == len(df.index)
                        y_train = df_train[facet].tolist()
                        y_train = le.transform(y_train)
                        y_test = df_test[facet].tolist()
                        y_test = le.transform(y_test)
                        # A single-class training set or an empty test set
                        # cannot be evaluated: skip this user.
                        if len(set(y_train)) == 1 or len(set(y_test)) == 0:
                            data_okay = False
                            break

                        scaler = StandardScaler()
                        if input_features == ['dummy1']:
                            lr_model = DummyClassifier(
                                strategy='stratified',
                                random_state=random.randint(1, 4294967294))
                        elif input_features == ['dummy2']:
                            lr_model = DummyClassifier(
                                strategy='most_frequent',
                                random_state=random.randint(1, 4294967294))
                        else:
                            X_tofit = df[features].values
                            y_tofit = df[facet].values
                            y_tofit = le.transform(y_tofit)
                            scaler.fit(X_tofit)
                            X_tofit = scaler.transform(X_tofit)

                            # Every candidate is instantiated on each pass, in
                            # this order; the random.randint calls consume the
                            # global RNG stream, so keep the order unchanged.
                            imap = Isomap(n_components=10)
                            pca = PCA(n_components=10)
                            rfe = RFE(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))
                            sfm = SelectFromModel(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))
                            knn = KNeighborsClassifier(n_neighbors=10)
                            gnb = GaussianNB()
                            mlp = MLPClassifier(alpha=1)
                            gsc = GridSearchCV(LinearSVC(), {
                                'dual': [True, False],
                                'C': [0.1, 1, 10]
                            })
                            svm = SVC(probability=True,
                                      gamma=0.7,
                                      C=1,
                                      random_state=random.randint(
                                          1, 4294967294))
                            dct = DecisionTreeClassifier(max_depth=8)
                            rfc = RandomForestClassifier(
                                random_state=random.randint(1, 4294967294))
                            ada = AdaBoostClassifier(
                                random_state=random.randint(1, 4294967294))
                            qda = QuadraticDiscriminantAnalysis()
                            anovakbest_filter = SelectKBest(
                                f_classif, k=min([20, len(features)]))
                            ovr = OneVsRestClassifier(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))
                            ovo = OneVsOneClassifier(
                                LinearSVC(random_state=random.randint(
                                    1, 4294967294)))

                            clf_map = {
                                'knn': knn,
                                'gnb': gnb,
                                'svm': svm,
                                'rfc': rfc,
                                'ada': ada,
                                'qda': qda,
                                'svc': svm,
                                'mlp': mlp,
                                'ovr': ovr,
                                'ovo': ovo,
                                'gsc': gsc,
                                'dct': dct
                            }
                            if clf is not None and clf in clf_map.keys():
                                classifier = clf_map[clf]
                                # ANOVA top-k feature selection feeds the
                                # chosen classifier.  PCA / RFE /
                                # SelectFromModel pipelines were tried before
                                # and are currently disabled.
                                lr_model = Pipeline([('anova',
                                                      anovakbest_filter),
                                                     ('clf', classifier)])
                            else:
                                lr_model = Pipeline([('anova',
                                                      anovakbest_filter),
                                                     ('clf', gnb)])

                        # The scaler used for prediction is fitted on the
                        # training rows only.
                        scaler.fit(X_train)
                        lr_model.fit(scaler.transform(X_train), y_train)
                        if clf == 'dct':
                            DCT_COUNT += 1
                            export_graphviz(
                                classifier,
                                out_file='/Users/Matt/Desktop/output/out%d.dot'
                                % DCT_COUNT,
                                feature_names=features)
                        break
                    except ValueError as e:
                        print(e)
                        print("fail")
                        pass

                if not data_okay:
                    print("NOT OKAY!")
                    continue

                X_test = scaler.transform(X_test)
                y_pred = lr_model.predict(X_test)

                def f(label, l):
                    return sum(l == label) / len(l)

                # Pool this user's labels and predictions with the others.
                y_tests_all += list(y_test)
                y_preds_all += list(y_pred)

            res["accuracy"] = metrics.accuracy_score(y_tests_all, y_preds_all)
            # Further metrics (F1, precision, recall, ROC AUC, average
            # precision, per-query counts) were computed here previously and
            # are currently disabled.
            res['y_true'] = le.inverse_transform(y_tests_all)
            res['y_pred'] = le.inverse_transform(y_preds_all)
            res['run_ct'] = RUN_CT
            results += [res]

    return pd.DataFrame(results)
get_ipython().system('conda install -c conda-forge pydotplus -y')
get_ipython().system('conda install -c conda-forge python-graphviz -y')

# sklearn.externals.six has been removed from scikit-learn; on Python 3 its
# StringIO was the standard-library io.StringIO anyway.
from io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
get_ipython().run_line_magic('matplotlib', 'inline')

# In-memory buffer that receives the DOT description of the tree.
dot_data = StringIO()
filename = "loan.png"
featureNames = df.columns[0:8]
targetNames = df['loan_status'].unique().tolist()
# export_graphviz writes into dot_data and returns None when out_file is given.
out = tree.export_graphviz(Tree,
                           feature_names=featureNames,
                           out_file=dot_data,
                           class_names=np.unique(y_trainset),
                           filled=True,
                           special_characters=True,
                           rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)

# Display the rendered PNG inline.
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

# # Support Vector Machine

# In[32]:

df.dtypes
# Keep only the rows whose 'education' value parses as a number.
df = df[pd.to_numeric(df['education'], errors='coerce').notnull()]
ha='right', fontsize=20) hm.xaxis.set_ticklabels(hm.xaxis.get_ticklabels(), rotation=0, ha='right', fontsize=20) plt.ylabel('True label', fontsize=20) plt.xlabel('Predicted label', fontsize=20) plt.title("Decision Tree - Entropy") plt.tight_layout() plt.show() # display decision tree dot_data = tree.export_graphviz(clf_gini, filled=True, rounded=True, class_names='survived', feature_names=tt.iloc[:, 0:].columns, out_file=None) graph = graph_from_dot_data(dot_data) graph.write_pdf("decision_tree_gini.pdf") webbrowser.open_new(r'decision_tree_gini.pdf') dot_data = tree.export_graphviz(clf_entropy, filled=True, rounded=True, class_names='survived', feature_names=tt.iloc[:, 0:].columns, out_file=None) graph = graph_from_dot_data(dot_data)
iris = load_iris()
# Combined frame (features + target) used only for the summary printouts.
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                  columns=iris['feature_names'] + ['target'])
# df['label'] = df.target.replace(dict(enumerate(df.target_names)))
print(df.head())  # first rows
print(iris.feature_names)
print(iris.target_names)
print(df.describe())  # check the spread between minimum and maximum values

x = iris['data']
y = iris['target']
iris_df = pd.DataFrame(x, columns=iris['feature_names'])
# head must be called; printing the bare attribute shows the bound method
# instead of the first rows.
print(iris_df.head())

x, y = shuffle(x, y, random_state=0)  # random shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

classifier = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = classifier.fit(x_train, y_train)  # fit returns the same estimator
y_pred = classifier.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))  # accuracy shown in the console

# Export the tree into an in-memory DOT buffer and render it as PDF.
dot_data = StringIO()
tree.export_graphviz(classifier,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True,
                     rounded=True,
                     impurity=False,
                     proportion=True)
# graph_from_dot_data is indexed here, i.e. treated as returning a list of graphs.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("iris3.pdf")
#Starting implementation correr en jupyter import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sns #La siguiente instruccion permite incorporar las graficas en este documento %matplotlib inline from sklearn import tree df = pd.read_csv("iris_df.csv") df.columns = ["X1", "X2", "X3","X4", "Y"] df.head() #implementation from sklearn.cross_validation import train_test_split decision = tree.DecisionTreeClassifier(criterion="gini") X = df.values[:, 0:4] Y = df.values[:, 4] trainX, testX, trainY, testY = train_test_split( X, Y, test_size = 0.3) decision.fit(trainX, trainY) print("Accuracy: \n", decision.score(testX, testY)) #Visualisation from sklearn.externals.six import StringIO from IPython.display import Image import pydotplus as pydot dot_data = StringIO() tree.export_graphviz(decision, out_file=dot_data) graph = pydot.graph_from_dot_data(dot_data.getvalue()) Image(graph.create_png())
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.savefig('tree_confusion_matrix.png',dpi=500,bbox_inches='tight')

# Columns '0'..'12' are the features, column '13' is the label.
data=pd.read_csv('/content/drive/My Drive/ML/wine.csv')
X=data[['0','1','2','3','4','5','6','7','8','9','10','11','12']].values
Y=data['13']
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2,random_state=4)

clf = DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain,Ytrain)
score = clf.score(Xtest,Ytest)  # returns the prediction accuracy
print(score)

feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','OD280/OD315稀释葡萄酒','脯氨酸']
dot_data = tree.export_graphviz(clf,out_file = None,feature_names= feature_name,class_names=["琴酒","雪莉","贝尔摩德"],filled=True,rounded=True)
graph = graphviz.Source(dot_data)
graph
# graph.format = 'png'
# graph.render("test",view=True)
#graph.view()
# system("dot -Tpng dtree2.png")

# Second model: random splitter with a fixed seed.
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=30 ,splitter="random")
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score

import graphviz
# out_file=None is passed explicitly, as in the first export: on older
# scikit-learn releases the default wrote "tree.dot" and returned None, which
# graphviz.Source cannot render.
dot_data = tree.export_graphviz(clf,out_file=None,feature_names= feature_name,class_names=["琴酒","雪莉","贝尔摩德"],filled=True,rounded=True )
graph = graphviz.Source(dot_data)
# Baseline: support vector classifier on train_x / test_x.
model = SVC()
model.fit(train_x, train_y)
predictions = model.predict(test_x)
accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy %.2f%%"% accuracy)

from sklearn.tree import DecisionTreeClassifier

# Seed NumPy's global RNG before building the tree.
SEED = 50
np.random.seed(SEED)

# The tree is fitted on raw_train_x / raw_test_x rather than train_x / test_x;
# presumably the unscaled features, so that split thresholds stay readable --
# TODO confirm where both variants are created.
model = DecisionTreeClassifier(max_depth = 2)
model.fit(raw_train_x, train_y)
predictions = model.predict(raw_test_x)
accuracy = accuracy_score(test_y, predictions) * 100
print("Accuracy %.2f%%"% accuracy)

from sklearn.tree import export_graphviz
import graphviz

features = x.columns
# out_file=None returns the DOT source as a string for graphviz.Source.
dot_data = export_graphviz(model, out_file=None,
                           filled = True, rounded = True,
                           feature_names = features,
                           class_names=["no","yes"])
graph = graphviz.Source(dot_data)
# Bare expression: only displays the graph in a notebook / REPL.
graph
criterion='gini') final.fit(X_train, Y_train) y_pred = final.predict(X_final) print('Accuracy:', metrics.accuracy_score(Y_final, y_pred)) #.74--- WOO estimator = final.estimators_[5] #Visualizing the features in my decision tree. One of the perks of a decision tree is its relative interpretability compared to other ML algorithms. from sklearn.tree import export_graphviz # Export as dot file export_graphviz(estimator, rounded=True, proportion=False, precision=2, filled=True) import matplotlib.pyplot as plt import numpy as np from sklearn import tree tree.plot_tree(classifier) #PCA from sklearn.preprocessing import MinMaxScaler min_max_scaler = MinMaxScaler() music_feature.loc[:] = min_max_scaler.fit_transform(music_feature.loc[:])
# Target: the Category column; features: every column after the first.
y = FeaturePicker.Category.to_frame()
X_train, X_test, y_train, y_test = train_test_split(FeaturePicker.iloc[:, 1:],
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# Sklearn will generate a decision tree for your dataset using an optimized
# version of the CART algorithm when you run the following code
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

# Metrics
import sklearn.metrics as met
y_pred = dtree.predict(X_test)
print(met.classification_report(y_test, y_pred))

# Decode the arrays back into string
# Graph the Tree
# sklearn.externals.six has been removed from scikit-learn; io.StringIO is the
# standard-library equivalent on Python 3.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(dtree,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# Pull out one tree from the forest (the statement was duplicated; once is enough)
tree = model.estimators_[5]

# Saving feature names
# NOTE(review): assumes train holds exactly the columns the model was fitted
# on -- confirm it does not include the target column.
feature_list = list(train.columns)

# Export the image to a dot file
export_graphviz(tree,
                out_file = 'Big_tree.dot',
                feature_names = feature_list,
                rounded = True,
                precision = 1)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('Big_tree.dot')

# Write graph to a png file
graph.write_png('Big_tree.png')

print(test)
predicted_aqhi = model.predict(test)
make_submission(predicted_aqhi,'Submission(RF)')
print("\n\nThe forest include only 10 trees and each tree included only the 3 level\n")

# Scatter plot of actual versus predicted values
import matplotlib.pyplot as plt
_, ax = plt.subplots()
ax.scatter(x = range(0, len(aqiArray1)), y=aqiArray1, c = 'blue', label = 'Actual', alpha = 0.3)
ax.scatter(x = range(0, predicted_aqhi.size), y=predicted_aqhi, c = 'red', label = 'Predicted', alpha = 0.3)
return y_hat-y grad = loss_gradient(y,y_hat) #%% 6. Pseudo Residuals - basically the negative of the gradient pseudo_residuals = -loss_gradient(y,y_hat) #%% 7 train first tree from sklearn.tree import DecisionTreeRegressor regressor = DecisionTreeRegressor(max_depth = 1) #%% ft = regressor.fit(x,pseudo_residuals) print(ft) #%% from sklearn.tree import export_graphviz import graphviz tree = export_graphviz(regressor, impurity = False, filled = True) open("boston.jpg","w").write(tree) graph = graphviz.Source(tree) print(graph) #%% y1 = y_hat + regressor.predict(x) plt.figure(figsize = (12,8)) plt.plot(y) plt.plot(y_hat) plt.plot(y1) plt.legend(['real values','mean','newest prediction']) #%% check the new loss cl = compute_loss(y,y1).mean() #around 23 #%% Second iteration of the tree pseudo_residuals = -loss_gradient(y,y1) # not y_hat, but y1 regressor.fit(x,pseudo_residuals)
Y_test[i] = 1 #print ("Hello") elif (Y_test[i] == '-'): Y_test[i] = 0 #this part is for tra # for i in range(len(model_pred_a)): # if (model_pred_a[i]== '+') : # model_pred_a[i]=1 # #print ("Hello") # elif (model_pred_a[i]=='-'): # model_pred_a[i]=0 for i in range(len(model_pred)): if (model_pred[i] == '+'): model_pred[i] = 1 # print ("Hello") elif (model_pred[i] == '-'): model_pred[i] = 0 model_pred = model_pred.astype(np.int) #model_pred_a=model_pred_a.astype(np.int) #print(model_pred_a) Y_test = Y_test.astype(np.int) #model_pred = np.array(model_pred) accuracy = np.equal(model_pred, Y_test).sum() / len(Y_test) #this part is for tra #accuracy_a=np.equal(model_pred_a,Y_test_1).sum()/len(Y_test_1) print(accuracy) part2 = tree.export_graphviz(model, out_file='2e.dot') # print (accuracy)
# One-hot encode the four categorical attributes.
d=pd.get_dummies(d,columns=['HC','DI','GA','RND'])
print(d)

# Numeric class label derived from 'Risco': alto -> 0, moderado -> 1, anything else -> 2
# (same order as class_names in the export below).
d['RC']=d.apply(lambda row: 0 if (row['Risco']) == 'alto' else 1 if (row['Risco']) == 'moderado' else 2, axis=1)
print()
print(d.head())
print()
#d['teste']=d.apply(lambda row:10 if (row['DI_baixa'])==1 and (row['RND_0 a 15'])==1 else 15,axis = 1)
#print(d.head())
#print()

# Shuffle the rows.  Training and "test" data are the same frame here.
d = d.sample(frac=1)
d_train = d
d_test = d

# Drop both label columns from the features: 'RC' is the target, and 'Risco' is
# the string column it was derived from (it would leak the label and cannot be
# fitted as a numeric feature).
# NOTE(review): this assumes the remaining columns are exactly the one-hot
# columns, matching the 10 values passed to predict() below -- confirm.
d_train_att = d_train.drop(['Risco', 'RC'],axis=1)
d_train_pass = d_train['RC']

from sklearn import tree
t = tree.DecisionTreeClassifier(criterion="entropy")
t = t.fit (d_train_att,d_train_pass)

tree.export_graphviz(t, out_file="risco.dot",label="all",impurity=False,proportion=True,
                     feature_names=list(d_train_att),class_names=['alto','moderado','baixo'],
                     filled=True,rounded=True)

# Classify one hand-written one-hot encoded sample.
t.predict([[0,1,0,0,1,0,1,0,0,1]])
# VISUALIZATION # if input("Make model tree? (y/N): ").lower() == 'y': print("\n\tSTARTING GRAPH") FEATURES = ['Average_Position_X', 'Average_Position_Y', 'Total_Distance', 'Average_Distance', 'Total_Duration', 'Average_Duration', 'Longest_Dist', 'WMax', 'EMax', 'NMax', 'SMax', 'Church-bin', 'NView-bin', 'Wallace-bin', 'Home-bin', '0-HourBin', '1-HourBin', '2-HourBin', '3-HourBin', '4-HourBin', '5-HourBin', '6-HourBin', '7-HourBin', '8-HourBin', '9-HourBin', '10-HourBin', '11-HourBin', '12-HourBin', '13-HourBin', '14-HourBin', '15-HourBin', '16-HourBin', '17-HourBin', '18-HourBin', '19-HourBin', '20-HourBin', '21-HourBin', '22-HourBin', '23-HourBin'] DAYS = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"] ABBREVIATED_DAYS = ["Sunday", "Mon-Thu", "Wednesday", "Friday", "Saturday"] dot_data = StringIO() tree.export_graphviz( clf, out_file=dot_data, filled=True, rounded=True, impurity=True, class_names=ABBREVIATED_DAYS, rotate=True, feature_names=FEATURES) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph = graph[0] graph.write_pdf("tree_model.pdf") print("\n\t~~~FINISHED GRAPH~~~") print("\tSaved as tree_model.pdf")
# "No skill" baseline for the precision-recall plot: the fraction of samples
# labelled 1 in labels_train_test.
ns_pr_rf = len(
    labels_train_test[labels_train_test == 1]) / len(labels_train_test)
plt.plot([0, 1], [ns_pr_rf, ns_pr_rf], linestyle='--')
plt.plot(rf_recall, rf_precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(['No Skill', 'Random Forest'])
plt.savefig('rf_PR.png')
plt.show()

# 5.4 A decision tree
# Export one member tree (index 5) of the fitted forest and render it as PNG.
tree = rf.estimators_[5]
export_graphviz(tree,
                out_file='tree.dot',
                feature_names=features_list_train,
                rounded=True,
                precision=1)
# Single-element unpacking: graph_from_dot_file is expected to return exactly
# one graph here.
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

# 5.5 A smaller tree
# Refit a forest limited to depth 3 so that a single tree stays readable.
rf_small = RandomForestClassifier(n_estimators=1660, max_depth=3)
rf_small.fit(features_train_train, labels_train_train)
tree_small = rf_small.estimators_[5]
export_graphviz(tree_small,
                out_file='small_tree.dot',
                feature_names=features_list_train,
                rounded=True,
                precision=1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
import numpy as np
import pandas as pd
import os
from sklearn import tree
from sklearn import preprocessing
from IPython.display import Image

# Relative file names below are resolved against this working directory.
mypath = 'C:\\Users\\ellen\\Desktop'
os.chdir(mypath)

train = pd.read_csv("106.csv")
features = ["time", "water", "age"]
# Select the feature columns directly instead of building a frame from three
# Series and transposing it; same columns, same row index.
trainer = train[features]

tree_model = tree.DecisionTreeClassifier(max_depth=3)
tree_model.fit(X=trainer, y=train["survive"])
# Accuracy on the training data itself (value only shows in a notebook / REPL).
tree_model.score(X=trainer, y=train["survive"])

# export_graphviz returns None when given a file object, so its result must not
# be assigned back to the open handle.
with open("tree3.dot", 'w') as f:
    tree.export_graphviz(tree_model, feature_names=features, out_file=f)