def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    print('Decision Tree with depths 1-25 (inclusive)\n')

    # these will keep our points
    graphTrain = []
    graphTest = []
    graphF1 = []

    # perform decision tree testing for each depth; decision_tree_testing
    # would be reusable here if it accepted a depth argument, so we inline
    # the same steps and set the proper depth on each iteration
    for depth in range(1, 26):
        print('Current depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        graphTrain.append(accuracy_score(preds_train, y_train))
        graphTest.append(accuracy_score(preds_test, y_test))
        print('Train {}'.format(accuracy_score(preds_train, y_train)))
        print('Test {}'.format(accuracy_score(preds_test, y_test)))
        print('F1 Test {}\n'.format(f1(y_test, preds_test)))
        graphF1.append(f1(y_test, preds_test))

    table = pd.DataFrame({
        "Max Depth": list(range(1, 26)),
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Score": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Score', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')

    # get the best depth in terms of test F1 score
    topF1 = max(graphF1)
    bestDepth = list(range(1, 26))[graphF1.index(topF1)]
    print("The depth that gives the best test F1 score is: ", bestDepth,
          "which has an F1 score of ", topF1)

    # get the most important feature for making a prediction
    clfMVP = DecisionTreeClassifier(max_depth=bestDepth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ", clfMVP.root.feature)
    print("The threshold to split on for this feature is: ", clfMVP.root.split)

    # return the most important feature for use in main
    return clfMVP.root.feature
def decision_tree_testing(x_train, y_train, x_test, y_test):
    print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=20)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(f1(y_test, preds_test)))
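# Several of these snippets import `accuracy_score` and `f1` from a local
# metrics module that is not shown. A minimal sketch of what those helpers
# might look like, assuming binary labels and the argument orders used in
# the calls above (f1(y_true, y_pred); accuracy_score is order-insensitive):
import numpy as np

def accuracy_score(preds, labels):
    # fraction of positions where prediction and label agree; symmetric,
    # so either argument order gives the same result
    preds, labels = np.asarray(preds), np.asarray(labels)
    return float(np.mean(preds == labels))

def f1(y_true, y_pred, positive=1):
    # binary F1 for the class `positive`; the positive label and the binary
    # setting are assumptions about the unseen metrics module
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    tp = np.sum((y_pred == positive) & (y_true == positive))
    fp = np.sum((y_pred == positive) & (y_true != positive))
    fn = np.sum((y_pred != positive) & (y_true == positive))
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)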
def main():
    columns, x_train, y_train, x_test, y_test = preprocessing()
    continuous = ['age', 'hours-per-week', 'capital-gain', 'capital-loss']
    all_x = np.vstack((x_train, x_test))

    random_forest_ID3 = RandomForest(columns[:-1], continuous, Criterion.ID3, all_x, 10)
    decision_tree_ID3 = DecisionTreeClassifier(columns[:-1], continuous, Criterion.ID3)
    random_forest_GINI = RandomForest(columns[:-1], continuous, Criterion.GINI, all_x, 10)
    decision_tree_GINI = DecisionTreeClassifier(columns[:-1], continuous, Criterion.GINI)
    decision_tree_ID3.set_attribute_values(all_x)
    decision_tree_GINI.set_attribute_values(all_x)

    validation = Validation(x_train, y_train, x_test, y_test)

    print('K-fold validation:\n\n')
    print('ID3 criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')
    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')
    print('GINI criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')
    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Final model: Random Forest\n')
    print('Final results: \n')
    final_measure = validation.final_measure(random_forest_ID3)
    print(f'Mean accuracy: {np.array(final_measure[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(final_measure[Measure.SPEC]).mean()}\n')

    print('\n\n Example of a decision tree trained on all the data is available at out/resultat.txt')
    # Print a decision tree trained on all the data, for visualization,
    # even though it is not the best model
    x_data = np.vstack((x_train, x_test))
    y_data = np.hstack((y_train, y_test))
    decision_tree_ID3.fit(x_data, y_data)
    write_to_file(decision_tree_ID3)
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    clf = DecisionTreeClassifier(max_depth=maxdepth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    # printing is silenced here; the caller aggregates the returned scores
    return f1(y_test, preds_test), train_accuracy, test_accuracy
def fit(self, X, y):
    # Generate a forest from random subsets of the data and features.
    self.forest = []
    n_samples = len(y)
    n_sub_samples = round(n_samples * self.bootstrap)

    for i in range(self.num_estimators):  # range, not Python 2's xrange
        shuffle_samples(X, y)
        X_subset = X[:n_sub_samples]
        y_subset = y[:n_sub_samples]

        tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                      self.min_samples_split)
        tree.fit(X_subset, y_subset)
        self.forest.append(tree)
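# The fit above relies on a shuffle_samples helper that is not shown. For the
# slicing that follows to yield a fresh random subset each iteration, it would
# have to shuffle X and y in place with one shared permutation; a minimal
# sketch under that assumption:
import numpy as np

def shuffle_samples(X, y):
    # one shared permutation keeps rows aligned with their labels;
    # in-place mutation is assumed from how fit() slices X and y afterwards
    permutation = np.random.permutation(len(y))
    X[:] = X[permutation]
    y[:] = y[permutation]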
def decision_tree_tune(x_train, y_train, x_test, y_test):
    print('Decision Tree tune\n\n')
    plotX = list(range(1, 26))
    plotTrain = []
    plotTest = []
    plotF1 = []

    for depth in range(1, 26):
        print('Max Depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        F1 = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(F1))
        print('\n')
        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "Max_Depth": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Score": plotF1
    })
    print(df)

    maxF1 = max(plotF1)
    bestDepth = plotX[plotF1.index(maxF1)]
    print("The best depth is ", bestDepth, "with F1 score ", maxF1)

    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Score', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()
    return bestDepth
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
    print('Decision Tree')
    print("depth : %d" % max_depth)
    clf = DecisionTreeClassifier(max_depth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))
    return train_accuracy, test_accuracy, f1_train, f1_test
def fit(self, X, Y):
    n_attr = X.shape[1]
    n_data = X.shape[0]
    m = int(np.sqrt(n_attr))  # number of attributes considered per estimator

    for estimator in range(self.n_estimators):
        # pick a random subset of m attributes for this estimator
        index_attr = np.array(range(n_attr))
        np.random.seed(estimator)
        np.random.shuffle(index_attr)
        index_attr = index_attr[:m]
        self.index_attr.append(index_attr)

        # bagging: sample a tenth of the rows, with replacement
        index_data = np.random.choice(n_data, int(n_data / 10), replace=True)
        x = X[index_data, :][:, index_attr]
        y = Y[index_data]

        model = DecisionTreeClassifier(self.attr_headers[index_attr],
                                       self.contionuous_attr_headers,
                                       self.criterion)
        model.set_labels(np.array(self.attr_values)[index_attr])
        model.fit(x, y)
        self.estimators.append(model)
def decision_tree_testing_depth(x_train, y_train, x_test, y_test, min_depth, max_depth):
    print('#Decision Tree Depth Testing\n\n')
    accuracyTrain = np.zeros(max_depth - min_depth)
    accuracyTest = np.zeros(max_depth - min_depth)
    f1Train = np.zeros(max_depth - min_depth)
    f1Test = np.zeros(max_depth - min_depth)
    depths = np.arange(min_depth, max_depth)
    index = 0

    for depth in depths:
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        accuracyTrain[index] = accuracy_score(preds_train, y_train)
        accuracyTest[index] = accuracy_score(preds_test, y_test)
        # train F1 from train predictions, test F1 from test predictions
        f1Train[index] = calc_f1(preds_train, y_train)
        f1Test[index] = calc_f1(preds_test, y_test)
        index += 1

    fig1 = plt.figure(1)
    plt.plot(depths, accuracyTrain)
    plt.plot(depths, accuracyTest)
    plt.title("Accuracy vs max depth")
    plt.ylabel("Accuracy")
    plt.xlabel("Depth")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    fig1.show()

    fig2 = plt.figure(2)
    plt.plot(depths, f1Train)
    plt.plot(depths, f1Test)
    plt.title("F1 vs max depth")
    plt.ylabel("F1")
    plt.xlabel("Depth")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
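# calc_f1 is not defined in these snippets. Given the (predictions, labels)
# call pattern in the loop above, it could be a thin wrapper over an F1
# helper such as the one sketched earlier; a hypothetical definition:
def calc_f1(preds, labels):
    # assumed wrapper matching the (predictions, labels) argument order
    # used in decision_tree_testing_depth
    return f1(labels, preds)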
#              'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
#              'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
#              'stalk-surface-below-ring', 'stalk-color-above-ring',
#              'stalk-color-below-ring', 'veil-type', 'veil-color',
#              'ring-number', 'ring-type', 'spore-print-color',
#              'population', 'habitat'], inplace=True)

data = data.sample(frac=1)  # shuffle the rows
n_train = int(len(data) * 0.7)
train = data[:n_train]
valid = data[n_train:]

print('\ngenerate CART...')
clf = DecisionTreeClassifier(type='CART')
clf.fit(train, parent=None)
plotTree.CART_Tree(clf.to_dict(), 'test/cart.png')
print('num_leaves={}'.format(clf.get_num_leaves(clf.tree)))
print('[train acc]\t{}'.format(clf.validate(train)))
print('[valid acc][cart]\t{}'.format(clf.validate(valid)))

print('\nprune CART...')
pruned = clf.prune_cart(train, valid)
print('num_leaves={}'.format(clf.get_num_leaves(pruned)))
pruned_dict = clf.to_dict(pruned)
plotTree.CART_Tree(pruned_dict, 'test/cart_p.png')
print('[valid acc][cart_pruned]\t{}'.format(clf.validate(valid, pruned)))

print('\ngenerate C4.5...')
clf_C45 = DecisionTreeClassifier(type='C4.5', epsilon=1e-6)
clf_C45.fit(train, parent=None)
# X, y = watermelon_data.values[:, :-1], watermelon_data.values[:, -1]
iris = load_iris()
# X = iris.data[50:, :2]
# y = iris.target[50:]
X = iris.data[:, :2]
X, index = np.unique(X, axis=0, return_index=True)  # drop duplicate points

# standardize the two features
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
y = iris.target[index]

# # a = NodeByID3(X, y, attributes=['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
# # a.fit()
# # a = NodeByC4_dot_5(X, y, attributes=['密度', '含糖率'])
# # a.fit()

classifier1 = DecisionTreeClassifier(criterion='GINI')  # , max_depth=4)
classifier1.fit(X, y)

# evaluate the classifier on a grid to draw the decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = classifier1.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# index = (Z == '是')
# Z[index] = 1
# Z[~index] = 0
# Z.astype('int')
cs = plt.contourf(xx, yy, Z, alpha=0.5)
plt.axis('tight')

colors = [[127 / 255, 127 / 255, 227 / 255],
          [163 / 255, 1, 213 / 255],
          [1, 127 / 255, 127 / 255]]
for i, color in zip([0, 1, 2], colors):
    # loop body truncated in the source; a plausible completion that
    # scatters each class in its own color:
    plt.scatter(X[y == i, 0], X[y == i, 1], color=color,
                label=iris.target_names[i], edgecolors='k')
import math

import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from tree import DecisionTreeClassifier

data = pd.read_csv('german_credit.csv')
target = data[data.columns[0]]
train = data[data.columns[1:]]

m_d = 7

boston = load_boston()
X, X_test, y, y_test = train_test_split(boston.data, boston.target, test_size=0.25)

model = DecisionTreeClassifier(max_depth=m_d)
print(X[:10])
model.fit(X, y)
print(y_test[:10])
# a = []
# for i in range(0, y.shape[0], 1):
#     a.append(model.predict(X[i]))
# print(a[:10])
a = model.predict(X_test)
print(a[:10])
print(math.sqrt(np.sum((y_test - a) ** 2) / float(len(a))))  # RMSE

model2 = DecisionTreeRegressor(max_depth=m_d)
model2.fit(X, y)
b = model2.predict(X_test)
# print(model2)
print(b[:10])
print(math.sqrt(np.sum((y_test - b) ** 2) / float(len(b))))  # RMSE
def find_most_important_feature(x_train, y_train, depth):
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(x_train, y_train)
    # the root split is the most informative feature
    return clf.root.feature
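# A hypothetical call site tying the tuning and inspection helpers together.
# preprocessing() is the loader assumed elsewhere in these snippets; the
# wiring below is illustrative rather than taken from the source:
if __name__ == '__main__':
    columns, x_train, y_train, x_test, y_test = preprocessing()  # assumed loader
    best_depth = decision_tree_tune(x_train, y_train, x_test, y_test)
    feature = find_most_important_feature(x_train, y_train, best_depth)
    print('Most important feature index:', feature)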
from tree import DecisionTreeClassifier
import pandas as pd  # optional, only used to display the data

clf = DecisionTreeClassifier()

# each row: [Height, Weight, Foot Size], per the DataFrame columns below
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]

Y = ['male', 'male', 'female', 'female', 'male', 'male',
     'female', 'female', 'female', 'male', 'male']

data = pd.DataFrame(X, Y, columns=['Height', 'Weight', 'Foot Size'])
print(data)

clf = clf.fit(X, Y)

questions = [[190, 70, 43], [175, 55, 40]]
predictions = clf.predict(questions)

print('\nPredictions:')
for i, prediction in enumerate(predictions):
    print(questions[i], prediction)
class TestCases(unittest.TestCase):
    '''
    The following set of tests is based on exercise 4 of the problem set,
    in which a decision tree (with the ID3 selection criterion) was
    computed by hand
    '''

    def setUp(self):
        data = [['Si', 'No', 'No', 'No'],
                ['Si', 'No', 'Si', 'No'],
                ['No', 'No', 'No', 'Si'],
                ['No', 'No', 'Si', 'No'],
                ['No', 'Si', 'Si', 'Si']]
        self.df = pd.DataFrame(
            data,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        self.tree = DecisionTreeClassifier(self.df.columns[:-1], [],
                                           Criterion.ID3)
        self.tree.set_attribute_values(self.df.to_numpy()[:, 0:3])

        data_nan = [['Si', 'No', 'No', 'No'],
                    ['?', 'No', 'Si', 'No'],
                    ['No', 'No', 'No', 'Si'],
                    ['No', 'No', '?', 'No'],
                    ['No', 'Si', '?', 'Si']]
        self.df_nan = pd.DataFrame(
            data_nan,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])

        '''Example from slide 76 of the theory slides (decision trees). We
        force the tree shown on the slide, since we know (from the manual
        computation) what this tree should predict for the attribute
        vectors [?, c2] and [?, ?]
        '''
        data_2 = [['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c1', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No']]
        self.df2 = pd.DataFrame(data_2, columns=['A', 'B', 'Objectiu'])
        self.tree2 = DecisionTreeClassifier(self.df2.columns[:-1], [],
                                            Criterion.ID3)
        self.tree2.set_attribute_values(self.df2.to_numpy()[:, 0:2])

    def test_entropy(self):
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        entrpy = entropy(s)
        entrpy_cond = []
        for a in A.T:
            entrpy_cond.append(entropy_cond(s, a))

        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_entrpy_cond = []
        expected_entrpy_cond.append(
            3 / 5 * (-1 / 3 * log(1 / 3, 2) - 2 / 3 * log(2 / 3, 2)))  # Operacio major
        expected_entrpy_cond.append(
            4 / 5 * (-3 / 4 * log(3 / 4, 2) - 1 / 4 * log(1 / 4, 2)))  # Familia
        expected_entrpy_cond.append(
            2 / 5 * (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2)) +
            3 / 5 * (-2 / 3 * log(2 / 3, 2) - 1 / 3 * log(1 / 3, 2)))  # Gran

        self.assertTrue(entrpy == expected_entrpy)
        for i in range(3):
            self.assertTrue(entrpy_cond[i] == expected_entrpy_cond[i])

    def test_gini(self):
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))

        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            expected_gini - 3 / 5 * (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3))  # Operacio major
        expected_gini_gain.append(
            expected_gini - 4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            expected_gini - 2 / 5 * (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2) -
            3 / 5 * (1 - 2 / 3 * 2 / 3 - 1 / 3 * 1 / 3))  # Gran

        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_tree(self):
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        node0 = self.tree.model
        self.assertTrue(type(node0) == SubTree)
        node1 = node0.child_nodes['No']
        node2 = node0.child_nodes['Si']
        self.assertTrue(type(node1) == SubTree)
        node3 = node1.child_nodes['No']
        node4 = node1.child_nodes['Si']
        self.assertTrue(type(node3) == SubTree)
        node5 = node3.child_nodes['No']
        node6 = node3.child_nodes['Si']
        self.assertTrue(type(node2) != SubTree)
        self.assertTrue(type(node4) != SubTree)
        self.assertTrue(type(node5) != SubTree)
        self.assertTrue(type(node6) != SubTree)

        # decision nodes
        self.assertTrue(node0.A_header[node0.attribute] == 'Operacio major')
        self.assertTrue(node1.A_header[node1.attribute] == 'Familia')
        self.assertTrue(node3.A_header[node3.attribute] == 'Gran')

        # leaves
        self.assertTrue(node2 == 'No')
        self.assertTrue(node4 == 'Si')
        self.assertTrue(node5 == 'Si')
        self.assertTrue(node6 == 'No')

    def test_predict(self):
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        test = pd.DataFrame(
            [['No', 'No', 'No'], ['No', 'No', 'Si'], ['No', 'Si', 'No'],
             ['No', 'Si', 'Si'], ['Si', 'No', 'No'], ['Si', 'No', 'Si'],
             ['Si', 'Si', 'No'], ['Si', 'Si', 'Si']],
            columns=['Operacio major', 'Familia', 'Gran']).to_numpy()
        output = self.tree.predict(test).tolist()
        expected_output = ['Si', 'No', 'Si', 'Si', 'No', 'No', 'No', 'No']
        self.assertListEqual(output, expected_output)

    def test_nan_gain_entropy(self):
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gain_output = []
        for a in A.T:
            gain_output.append(gain(s, a))

        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_gain = []
        expected_gain.append(
            4 / 5 * (expected_entrpy - 3 / 4 *
                     (-1 / 3 * log(1 / 3, 2) - 2 / 3 * log(2 / 3, 2))))  # Operacio major
        expected_gain.append(
            expected_entrpy -
            4 / 5 * (-3 / 4 * log(3 / 4, 2) - 1 / 4 * log(1 / 4, 2)))  # Familia
        expected_gain.append(
            3 / 5 * (expected_entrpy - 2 / 3 *
                     (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2))))  # Gran

        for i in range(3):
            self.assertTrue(gain_output[i] == expected_gain[i])

    def test_gini_nan(self):
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))

        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            4 / 5 * (expected_gini - 3 / 4 *
                     (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3)))  # Operacio major
        expected_gini_gain.append(
            expected_gini - 4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            3 / 5 * (expected_gini - 2 / 3 *
                     (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2)))  # Gran

        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_predict_nan(self):
        self.tree2.fit(self.df2.values[:, 0:2], self.df2.values[:, 2])
        test = pd.DataFrame([['?', 'c2'], ['?', '?']],
                            columns=['B', 'C']).to_numpy()
        output = self.tree2.predict(test).tolist()
        expected_output = ['No', 'Yes']

        probabilities1 = self.tree2.model.predict_nan_value(['?', 'c2'])
        probabilities2 = self.tree2.model.predict_nan_value(['?', '?'])
        expected_probabilities1 = 8 / 12, 4 / 12
        expected_probabilities2 = (8 / 12) * (5 / 8), (4 / 12 + (8 / 12) * (3 / 8))

        self.assertListEqual(output, expected_output)
        self.assertTupleEqual(probabilities1, expected_probabilities1)
        self.assertTrue(
            abs(probabilities2[0] - expected_probabilities2[0]) < 1E-15)  # truncation error
        self.assertTrue(
            abs(probabilities2[1] - expected_probabilities2[1]) < 1E-15)  # truncation error
# Reconstructed header: the original snippet begins mid-function, so the
# signature and the `train = []` initialisation below are assumptions
def train_test(dataset, split=0.70):
    train = []
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Split out training and test sets to use in model
train, test = train_test(list_of_rows[1:])

# Instantiate manual classifier
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=4)

# Fit / Create the decision tree
tree = clf.fit(train)

# Example of prediction generation
predictions = []
for row in list_of_rows[1:]:
    prediction = clf.predict(tree, row)
    predictions.append(prediction)

# Find accuracy of decision tree train & test data
training_accuracy = clf.accuracy(tree, train)
test_accuracy = clf.accuracy(tree, test)
print(f"Manual Training Accuracy: {training_accuracy:.2%}")
print(f"Manual Test Accuracy: {test_accuracy:.2%}")

# =============================================================================
import pandas as pd

from tree import DecisionTreeClassifier
from metrics import accuracy_score
from utils import train_test_split

if __name__ == '__main__':
    column_names = ['parents', 'has_nurs', 'form', 'children', 'housing',
                    'finance', 'social', 'health', 'classes']
    data = pd.read_csv('./nursery.data', names=column_names)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                        random_state=666)

    dt = DecisionTreeClassifier()
    # fit on the training split only, otherwise the test score leaks
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print(y_pred)
    print()

    score = accuracy_score(y_test, y_pred)
    print(score)
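# The train_test_split imported from utils above is a local helper rather
# than scikit-learn's. A minimal sketch of a compatible implementation,
# assuming pandas inputs and the keyword arguments used above:
import numpy as np

def train_test_split(X, y, test_size=0.2, random_state=None):
    # shuffle row positions reproducibly, then split; the .iloc indexing
    # assumes X is a DataFrame and y a Series, as in the script above
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(len(X))
    n_test = int(len(X) * test_size)
    test_idx, train_idx = indices[:n_test], indices[n_test:]
    return (X.iloc[train_idx], X.iloc[test_idx],
            y.iloc[train_idx], y.iloc[test_idx])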