def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    """Sweep decision-tree max_depth over 1-25 and report/plot metrics.

    For each depth, train/test accuracy and test F1 are recorded, printed,
    tabulated, and plotted to 'q1.png'.  The depth with the best test F1 is
    then refit so the root split can be inspected.

    Returns:
        The feature used at the root of the best tree
        (``clfMVP.root.feature`` -- assumes the project's
        DecisionTreeClassifier exposes a ``root`` node; confirm).
    """
    print('Decision Tree with depths 1-25 (inclusive)\n')
    depths = list(range(1, 26))
    graph_train = []
    graph_test = []
    graph_f1 = []
    for depth in depths:
        print('Current depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        # Predict and score each split once (the original re-predicted and
        # re-scored several times per iteration).
        train_acc = accuracy_score(clf.predict(x_train), y_train)
        preds_test = clf.predict(x_test)
        test_acc = accuracy_score(preds_test, y_test)
        f1_score = f1(y_test, preds_test)
        graph_train.append(train_acc)
        graph_test.append(test_acc)
        graph_f1.append(f1_score)
        print('Train {}'.format(train_acc))
        print('Test {}'.format(test_acc))
        print('F1 Test {}\n'.format(f1_score))
    table = pd.DataFrame({
        "Max Depth": depths,
        "Train Accuracy": graph_train,
        "Test Accuracy": graph_test,
        "F1 Accuracy": graph_f1
    })
    print(table)
    # Plot all three curves and save to a file.
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    # Fixed mislabelled title: this sweeps tree depth, not number of trees.
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')
    # Best depth according to test-set F1.
    top_f1 = max(graph_f1)
    best_depth = depths[graph_f1.index(top_f1)]
    print("The depth that gives the best validation accuracy is: ",
          best_depth, "which has an F1 accuracy of ", top_f1)
    # Refit at the best depth to inspect the root split.
    clfMVP = DecisionTreeClassifier(max_depth=best_depth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ",
          clfMVP.root.feature)
    print("The threshold to split on for this feature is: ",
          clfMVP.root.split)
    # Return the most important feature for use in main.
    return clfMVP.root.feature
def decision_tree_testing(x_train, y_train, x_test, y_test):
    """Fit a fixed depth-20 decision tree; print train/test accuracy and test F1."""
    print('Decision Tree\n\n')
    model = DecisionTreeClassifier(max_depth=20)
    model.fit(x_train, y_train)
    acc_train = accuracy_score(model.predict(x_train), y_train)
    acc_test = accuracy_score(model.predict(x_test), y_test)
    print('Train {}'.format(acc_train))
    print('Test {}'.format(acc_test))
    test_preds = model.predict(x_test)
    print('F1 Test {}'.format(f1(y_test, test_preds)))
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    """Fit one decision tree at the given depth.

    Returns:
        A (test F1, train accuracy, test accuracy) tuple.
    """
    model = DecisionTreeClassifier(max_depth=maxdepth)
    model.fit(x_train, y_train)
    acc_train = accuracy_score(model.predict(x_train), y_train)
    acc_test = accuracy_score(model.predict(x_test), y_test)
    test_preds = model.predict(x_test)
    return f1(y_test, test_preds), acc_train, acc_test
def decision_tree_tune(x_train, y_train, x_test, y_test):
    """Tune decision-tree max_depth over 1-25, choosing by test F1.

    Prints per-depth metrics (rounded to 3 places), tabulates them, plots
    the curves to 'decision_tree_output.png', and returns the depth with
    the highest test F1 score.
    """
    print('Decision Tree tune\n\n')
    depths = list(range(1, 26))
    plot_train = []
    plot_test = []
    plot_f1 = []
    for depth in depths:
        print('Max Depth: ', depth)  # fixed typo: was 'Math Depth'
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        # One predict per split (the original re-predicted x_test for F1).
        train_accuracy = round(accuracy_score(clf.predict(x_train), y_train), 3)
        preds_test = clf.predict(x_test)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        f1_score = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(f1_score))
        print('\n')
        plot_train.append(train_accuracy)
        plot_test.append(test_accuracy)
        plot_f1.append(f1_score)
    df = pd.DataFrame({
        "Max_Depth": depths,
        "Train_Accuracy": plot_train,
        "Test_Accuracy": plot_test,
        "F1_Accuracy": plot_f1
    })
    print(df)
    best_f1 = max(plot_f1)
    best_depth = depths[plot_f1.index(best_f1)]
    print("The best Depth is ", best_depth, "with F1 accuracy ", best_f1)
    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()
    return best_depth
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
    """Fit a decision tree at ``max_depth`` and report accuracy and F1.

    Returns:
        (train accuracy, test accuracy, train F1, test F1)
    """
    print('Decision Tree')
    print("depth : %d" % max_depth)
    # NOTE(review): max_depth is passed positionally; assumes it is the
    # classifier's first constructor parameter -- confirm against tree.py.
    clf = DecisionTreeClassifier(max_depth)
    clf.fit(x_train, y_train)
    # Predict once per split; the original re-predicted both sets for F1.
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))
    return train_accuracy, test_accuracy, f1_train, f1_test
def decision_tree_testing_depth(x_train, y_train, x_test, y_test, min, max):
    """Evaluate decision trees for every depth in [min, max) and plot results.

    Figure 1 plots train/test accuracy against depth; figure 2 plots
    train/test F1 against depth.  Nothing is returned.

    NOTE(review): ``min``/``max`` shadow the builtins; the names are kept
    to preserve the public signature.
    """
    print('#Decision Tree Depth Testing\n\n')
    n_depths = max - min
    accuracyTrain = np.zeros(n_depths)
    accuracyTest = np.zeros(n_depths)
    f1Train = np.zeros(n_depths)
    f1Test = np.zeros(n_depths)
    depths = np.arange(min, max)
    for index, depth in enumerate(depths):
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        accuracyTrain[index] = accuracy_score(preds_train, y_train)
        accuracyTest[index] = accuracy_score(preds_test, y_test)
        # BUG FIX: the F1 assignments were swapped -- f1Test was computed
        # from the TRAIN predictions and f1Train from the TEST predictions.
        f1Train[index] = calc_f1(preds_train, y_train)
        f1Test[index] = calc_f1(preds_test, y_test)
    # Renamed the figure handle from `f1` so it no longer shadows the F1
    # metric name used elsewhere in this file.
    fig1 = plt.figure(1)
    plt.plot(depths, accuracyTrain)
    plt.plot(depths, accuracyTest)
    plt.title("accuracy vs number of trees")
    plt.ylabel("Accuracy")
    plt.xlabel("Depth")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    fig1.show()
    fig2 = plt.figure(2)
    plt.plot(depths, f1Train)
    plt.plot(depths, f1Test)
    plt.title("F1 vs number of trees")
    plt.ylabel("F1")
    plt.xlabel("Depth")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
return train, dataset_copy # Split out training and test sets to use in model train, test = train_test(list_of_rows[1:]) # Instantiate manual classifier clf = DecisionTreeClassifier(max_depth=5, min_samples_split=4) # Fit / Create the decision tree tree = clf.fit(train) # Example of prediction generation predictions = [] for row in list_of_rows[1:]: prediction = clf.predict(tree, row) predictions.append(prediction) # Find accuracy of decision tree train & test data training_accuracy = clf.accuracy(tree, train) test_accuracy = clf.accuracy(tree, test) print(f"Manual Training Accuracy: {training_accuracy:.2%}") print(f"Manual Test Accuracy: {test_accuracy:.2%}") # ============================================================================= # Compare to actual function using pandas and sklearn # ============================================================================= df = pd.read_csv("iris.csv") train, test = train_test_split(df,
class TestCases(unittest.TestCase):
    '''The following tests are based on exercise 4 of the problem set, in
    which a decision tree (ID3 selection criterion) was computed by hand.
    (Original comments translated from Catalan.)'''

    def setUp(self):
        # Hand-worked dataset: predict 'Enviar a casa' (send home) from
        # three categorical attributes.
        data = [['Si', 'No', 'No', 'No'], ['Si', 'No', 'Si', 'No'],
                ['No', 'No', 'No', 'Si'], ['No', 'No', 'Si', 'No'],
                ['No', 'Si', 'Si', 'Si']]
        self.df = pd.DataFrame(
            data,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        self.tree = DecisionTreeClassifier(self.df.columns[:-1], [],
                                           Criterion.ID3)
        self.tree.set_attribute_values(self.df.to_numpy()[:, 0:3])
        # Same dataset with '?' marking missing values.
        data_nan = [['Si', 'No', 'No', 'No'], ['?', 'No', 'Si', 'No'],
                    ['No', 'No', 'No', 'Si'], ['No', 'No', '?', 'No'],
                    ['No', 'Si', '?', 'Si']]
        self.df_nan = pd.DataFrame(
            data_nan,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        # Example from slide 76 of the theory deck (decision trees): we force
        # the tree shown on the slide, since we know (from the hand-worked
        # solution) what this tree should predict for the attribute vectors
        # [?, c2] and [?, ?].
        data_2 = [['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c1', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No']]
        self.df2 = pd.DataFrame(data_2, columns=['A', 'B', 'Objectiu'])
        self.tree2 = DecisionTreeClassifier(self.df2.columns[:-1], [],
                                            Criterion.ID3)
        self.tree2.set_attribute_values(self.df2.to_numpy()[:, 0:2])

    def test_entropy(self):
        # Compare entropy / conditional entropy against hand-computed values.
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        entrpy = entropy(s)
        entrpy_cond = []
        for a in A.T:
            entrpy_cond.append(entropy_cond(s, a))
        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_entrpy_cond = []
        expected_entrpy_cond.append(
            3 / 5 * (-1 / 3 * log(1 / 3, 2) -
                     2 / 3 * log(2 / 3, 2)))  # Operacio major
        expected_entrpy_cond.append(
            4 / 5 * (-3 / 4 * log(3 / 4, 2) -
                     1 / 4 * log(1 / 4, 2)))  # Familia
        expected_entrpy_cond.append(
            2 / 5 * (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2)) +
            3 / 5 * (-2 / 3 * log(2 / 3, 2) - 1 / 3 * log(1 / 3, 2)))  # Gran
        self.assertTrue(entrpy == expected_entrpy)
        for i in range(3):
            self.assertTrue(entrpy_cond[i] == expected_entrpy_cond[i])

    def test_gini(self):
        # Compare Gini impurity / gain against hand-computed values.
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))
        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            expected_gini -
            3 / 5 * (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3))  # Operacio major
        expected_gini_gain.append(
            expected_gini -
            4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            expected_gini -
            2 / 5 * (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2) -
            3 / 5 * (1 - 2 / 3 * 2 / 3 - 1 / 3 * 1 / 3))  # Gran
        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_tree(self):
        # Fit the tree and verify its structure matches the hand-built one.
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        node0 = self.tree.model
        self.assertTrue(type(node0) == SubTree)
        node1 = node0.child_nodes['No']
        node2 = node0.child_nodes['Si']
        self.assertTrue(type(node1) == SubTree)
        node3 = node1.child_nodes['No']
        node4 = node1.child_nodes['Si']
        self.assertTrue(type(node3) == SubTree)
        node5 = node3.child_nodes['No']
        node6 = node3.child_nodes['Si']
        self.assertTrue(type(node2) != SubTree)
        self.assertTrue(type(node4) != SubTree)
        self.assertTrue(type(node5) != SubTree)
        self.assertTrue(type(node6) != SubTree)
        # Decision nodes.
        self.assertTrue(node0.A_header[node0.attribute] == 'Operacio major')
        self.assertTrue(node1.A_header[node1.attribute] == 'Familia')
        self.assertTrue(node3.A_header[node3.attribute] == 'Gran')
        # Leaves.
        self.assertTrue(node2 == 'No')
        self.assertTrue(node4 == 'Si')
        self.assertTrue(node5 == 'Si')
        self.assertTrue(node6 == 'No')

    def test_predict(self):
        # Predict all 8 attribute combinations against known answers.
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        test = pd.DataFrame(
            [['No', 'No', 'No'], ['No', 'No', 'Si'], ['No', 'Si', 'No'],
             ['No', 'Si', 'Si'], ['Si', 'No', 'No'], ['Si', 'No', 'Si'],
             ['Si', 'Si', 'No'], ['Si', 'Si', 'Si']],
            columns=['Operacio major', 'Familia', 'Gran']).to_numpy()
        output = self.tree.predict(test).tolist()
        expected_output = ['Si', 'No', 'Si', 'Si', 'No', 'No', 'No', 'No']
        self.assertListEqual(output, expected_output)

    def test_nan_gain_entropy(self):
        # Information gain in the presence of '?' (missing) values.
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gain_output = []
        for a in A.T:
            gain_output.append(gain(s, a))
        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_gain = []
        expected_gain.append(
            4 / 5 * (expected_entrpy - 3 / 4 *
                     (-1 / 3 * log(1 / 3, 2) -
                      2 / 3 * log(2 / 3, 2))))  # Operacio major
        expected_gain.append(
            expected_entrpy -
            4 / 5 * (-3 / 4 * log(3 / 4, 2) -
                     1 / 4 * log(1 / 4, 2)))  # Familia
        expected_gain.append(
            3 / 5 * (expected_entrpy - 2 / 3 *
                     (-1 / 2 * log(1 / 2, 2) -
                      1 / 2 * log(1 / 2, 2))))  # Gran
        for i in range(3):
            self.assertTrue(gain_output[i] == expected_gain[i])

    def test_gini_nan(self):
        # Gini gain in the presence of '?' (missing) values.
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))
        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            4 / 5 * (expected_gini - 3 / 4 *
                     (1 - 1 / 3 * 1 / 3 -
                      2 / 3 * 2 / 3)))  # Operacio major
        expected_gini_gain.append(
            expected_gini -
            4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            3 / 5 * (expected_gini - 2 / 3 *
                     (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2)))  # Gran
        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_predict_nan(self):
        # Predictions and class probabilities for rows with missing values.
        self.tree2.fit(self.df2.values[:, 0:2], self.df2.values[:, 2])
        test = pd.DataFrame([['?', 'c2'], ['?', '?']],
                            columns=['B', 'C']).to_numpy()
        output = self.tree2.predict(test).tolist()
        expected_output = ['No', 'Yes']
        probabilities1 = self.tree2.model.predict_nan_value(['?', 'c2'])
        probabilities2 = self.tree2.model.predict_nan_value(['?', '?'])
        expected_probabilities1 = 8 / 12, 4 / 12
        expected_probabilities2 = (8 / 12) * (5 / 8), (4 / 12 +
                                                       (8 / 12) * (3 / 8))
        self.assertListEqual(output, expected_output)
        self.assertTupleEqual(probabilities1, expected_probabilities1)
        self.assertTrue(
            abs(probabilities2[0] - expected_probabilities2[0]) <
            1E-15)  # truncation error
        # BUG FIX: this previously compared probabilities2[1] with itself
        # (always true); compare against the expected value, with the same
        # tolerance used above.
        self.assertTrue(
            abs(probabilities2[1] - expected_probabilities2[1]) < 1E-15)
# Deduplicate samples, then z-score normalise each feature.
# NOTE(review): `X` and `iris` are defined earlier in the file (not shown
# here) -- presumably the iris dataset; confirm.
X, index = np.unique(X, axis=0, return_index=True)
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = iris.target[index]

# Fit a GINI-criterion decision tree on the normalised data.
classifier1 = DecisionTreeClassifier(criterion='GINI')
classifier1.fit(X, y)

# Build a 0.02-step mesh spanning the (padded) range of the first two
# features.
x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_lo, x_hi, 0.02),
                     np.arange(y_lo, y_hi, 0.02))

# Predict a class for every grid point and shade the decision regions.
Z = classifier1.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, alpha=0.5)
plt.axis('tight')

# Overlay the training points, one colour per class.
palette = [[127 / 255, 127 / 255, 227 / 255],
           [163 / 255, 1, 213 / 255],
           [1, 127 / 255, 127 / 255]]
for label, color in zip([0, 1, 2], palette):
    pts = np.where(y == label)
    plt.scatter(X[pts, 0], X[pts, 1], c=color)
plt.show()
from tree import DecisionTreeClassifier
import pandas as pd  # optional

# Toy demo: classify gender from [height, weight, foot size].
clf = DecisionTreeClassifier()

X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male',
     'female', 'female', 'female', 'male', 'male']

# Display the training data as a labelled table.
data = pd.DataFrame(X, Y, columns=['Height', 'Weight', 'Foot Size'])
print(data)

clf = clf.fit(X, Y)

# Predict two unseen samples and print each alongside its input.
questions = [[190, 70, 43], [175, 55, 40]]
predictions = clf.predict(questions)
print('\nPredictions:')
for question, prediction in zip(questions, predictions):
    print(question, prediction)
from tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Credit data is loaded but never used below.
# NOTE(review): likely leftover from an earlier experiment -- confirm
# before deleting.
data = pd.read_csv('german_credit.csv')
target = data[data.columns[0]]
train = data[data.columns[1:]]

m_d = 7  # shared max depth for both models

# Compare the project classifier against sklearn's regressor on Boston
# housing, using RMSE on a held-out 25% split.
boston = load_boston()
X, X_test, y, y_test = train_test_split(boston.data, boston.target,
                                        test_size=0.25)

model = DecisionTreeClassifier(max_depth=m_d)
# FIX: converted Python-2-only `print expr` statements to call syntax;
# single-argument print(...) behaves identically on Python 2 and 3 and
# matches the rest of the file.
print(X[:10])
model.fit(X, y)
print(y_test[:10])
a = model.predict(X_test)
print(a[:10])
print(math.sqrt(np.sum((y_test - a) ** 2) / float(len(a))))  # RMSE

model2 = DecisionTreeRegressor(max_depth=m_d)
model2.fit(X, y)
b = model2.predict(X_test)
print(b[:10])
print(math.sqrt(np.sum((y_test - b) ** 2) / float(len(b))))  # RMSE
import pandas as pd
from tree import DecisionTreeClassifier
from metrics import accuracy_score
from utils import train_test_split

if __name__ == '__main__':
    # Nursery dataset: last column is the class label, the rest are features.
    column_names = ['parents', 'has_nurs', 'form', 'children', 'housing',
                    'finance', 'social', 'health', 'classes']
    data = pd.read_csv('./nursery.data', names=column_names)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.2,
                                                        random_state=666)
    dt = DecisionTreeClassifier()
    # BUG FIX: previously fit on the full dataset (dt.fit(X, y)), leaking
    # the held-out test rows into training; fit only on the training split.
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print(y_pred)
    print()
    # Accuracy on the genuinely unseen test split.
    score = accuracy_score(y_test, y_pred)
    print(score)