Example #1
def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    print('Decision Tree with depths 1-25 (inclusive)\n')

    # these lists will hold our plot points
    graphTrain = []
    graphTest = []
    graphF1 = []

    # perform decision tree testing for each depth
    # i'd like to use the decision_tree_testing function here, but we need to set the proper depth for each iteration
    for depth in range(1, 26):
        print('Current depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        test_f1 = f1(y_test, preds_test)
        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(test_f1)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(test_f1))

    table = pd.DataFrame({
        "Max Depth": list(range(1, 26)),
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "Test F1": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'Test F1', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')

    # get the best depth in terms of test F1 score
    topF1 = max(graphF1)
    bestDepth = graphF1.index(topF1) + 1
    print("The depth that gives the best test F1 score is: ", bestDepth,
          "which has an F1 score of ", topF1)

    # get the most important feature for making a prediction
    clfMVP = DecisionTreeClassifier(max_depth=bestDepth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ",
          clfMVP.root.feature)
    print("The threshold to split on for this feature is: ", clfMVP.root.split)

    # return the most important feature for use in main
    return clfMVP.root.feature


def decision_tree_testing(x_train, y_train, x_test, y_test):
    print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=20)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(f1(y_test, preds_test)))
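Several of these snippets (Examples #1, #6, #7) call a course-provided f1 helper rather than sklearn's f1_score. A minimal sketch of a binary F1 metric with the same call signature, assuming the positive class is labeled 1, might look like this:

import numpy as np

def f1(y_true, y_pred, positive=1):
    # Hypothetical stand-in for the course's f1 helper: binary F1 score,
    # assuming the positive class is labeled `positive`.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = np.sum((y_pred == positive) & (y_true == positive))
    fp = np.sum((y_pred == positive) & (y_true != positive))
    fn = np.sum((y_pred != positive) & (y_true == positive))
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)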
Example #3
def main():
    columns, x_train, y_train, x_test, y_test = preprocessing()
    random_forest_ID3 = RandomForest(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                     Criterion.ID3, np.vstack((x_train, x_test)), 10)
    decision_tree_ID3 = DecisionTreeClassifier(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                               Criterion.ID3)

    random_forest_GINI = RandomForest(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                      Criterion.GINI, np.vstack((x_train, x_test)), 10)
    decision_tree_GINI = DecisionTreeClassifier(columns[:-1], ['age', 'hours-per-week', 'capital-gain', 'capital-loss'],
                                                Criterion.GINI)
    decision_tree_ID3.set_attribute_values(np.vstack((x_train, x_test)))
    decision_tree_GINI.set_attribute_values(np.vstack((x_train, x_test)))
    validation = Validation(x_train, y_train, x_test, y_test)

    print('K-fold validation:\n\n')
    print('ID3 criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')


    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('GINI criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')


    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Final model: Random Forest\n')
    print('Final results:\n')
    final_measure = validation.final_measure(random_forest_ID3)
    print(f'Mean accuracy: {np.array(final_measure[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(final_measure[Measure.SPEC]).mean()}\n')


    print('\n\n Example of a decision tree trained on all the available data at out/resultat.txt')
    # Print a decision tree trained on all the data for visualization, even though it is not the best model
    x_data = np.vstack((x_train, x_test))
    y_data = np.hstack((y_train, y_test))
    decision_tree_ID3.fit(x_data, y_data)
    write_to_file(decision_tree_ID3)
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    #print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=maxdepth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    #print('Train {}'.format(train_accuracy))
    #print('Test {}'.format(test_accuracy))
    #print('F1 Test {}'.format(f1(y_test, preds_test)))
    return f1(y_test, preds_test), train_accuracy, test_accuracy
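The main above depends on a custom Validation class whose score_cross_val is not shown. As a rough sketch only, a k-fold accuracy loop in the same spirit (the function name, arguments, and return shape here are assumptions, not the project's API) could be:

import numpy as np

def score_cross_val_sketch(k, model, x_train, y_train):
    # Hypothetical sketch of a k-fold scorer: split the training data into
    # k folds, fit on k-1 of them, score on the held-out fold, and collect
    # per-fold accuracies.
    folds_x = np.array_split(x_train, k)
    folds_y = np.array_split(y_train, k)
    accuracies = []
    for i in range(k):
        x_val, y_val = folds_x[i], folds_y[i]
        x_tr = np.vstack([folds_x[j] for j in range(k) if j != i])
        y_tr = np.hstack([folds_y[j] for j in range(k) if j != i])
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        accuracies.append(np.mean(preds == y_val))
    return accuracies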
Example #5
    def fit(self, X, y):
        # Generate a forest, training each tree on a random subset of the data and features.
        self.forest = []
        n_samples = len(y)
        n_sub_samples = round(n_samples * self.bootstrap)

        for i in range(self.num_estimators):  # range, not Python 2's xrange
            shuffle_samples(X, y)
            X_subset = X[:n_sub_samples]
            y_subset = y[:n_sub_samples]

            tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                          self.min_samples_split)
            tree.fit(X_subset, y_subset)
            self.forest.append(tree)
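The fit above calls a shuffle_samples helper that the snippet does not include. A minimal sketch, assuming it applies one shared in-place permutation so rows and labels stay aligned before the bootstrap slice:

import numpy as np

def shuffle_samples(X, y):
    # Hypothetical helper (not shown in the original): shuffle X and y
    # in place with the same permutation so rows and labels stay paired.
    perm = np.random.permutation(len(y))
    X[:] = X[perm]
    y[:] = y[perm]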
Example #6
def decision_tree_tune(x_train, y_train, x_test, y_test):
    print('Decision Tree tune\n\n')
    plotX = list(range(1, 26))
    plotTrain = []
    plotTest = []
    plotF1 = []

    for depth in range(1, 26):
        print('Max Depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        F1 = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(F1))
        print('\n')
        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "Max_Depth": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1
    })
    print(df)
    maxAccuracy = max(plotF1)
    bestDepth = plotX[plotF1.index(maxAccuracy)]
    print("The best Depth is ", bestDepth, "with F1 accuracy ", maxAccuracy)

    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()
    return bestDepth
Example #7
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
	print('Decision Tree')
	print("depth : %d" % max_depth)
	
	clf = DecisionTreeClassifier(max_depth=max_depth)
	clf.fit(x_train, y_train)
	preds_train = clf.predict(x_train)
	preds_test = clf.predict(x_test)
	train_accuracy = accuracy_score(preds_train, y_train)
	test_accuracy = accuracy_score(preds_test, y_test)
	print('Train {}'.format(train_accuracy))
	print('Test {}'.format(test_accuracy))
	print('F1 Train {}'.format(f1(y_train, preds_train)))
	print('F1 Test {}\n'.format(f1(y_test, preds_test)))

	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds_test)
Example #8
    def fit(self, X, Y):

        n_attr = X.shape[1]
        n_data = X.shape[0]
        m = int(np.sqrt(n_attr))

        for estimator in range(self.n_estimators):
            index_attr = np.array(range(n_attr))
            np.random.seed(estimator)
            np.random.shuffle(index_attr)
            index_attr = index_attr[:m]
            self.index_attr.append(index_attr)

            # bagging: bootstrap-sample a tenth of the rows, with replacement
            index_data = np.random.choice(n_data, int(n_data/10), replace=True)

            x = X[index_data, :][:, index_attr]
            y = Y[index_data]
            model = DecisionTreeClassifier(self.attr_headers[index_attr], self.contionuous_attr_headers, self.criterion)
            model.set_labels(np.array(self.attr_values)[index_attr])
            model.fit(x, y)
            self.estimators.append(model)
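This fit stores each tree alongside the feature subset it was trained on, but the matching predict is not shown. A hedged sketch of a majority-vote predict built on those attributes (the method body is an assumption, and it needs `from collections import Counter` plus numpy):

    def predict(self, X):
        # Each estimator votes using only the columns it was trained on;
        # the forest returns the most common label per sample.
        all_preds = np.array([
            est.predict(X[:, idx])
            for est, idx in zip(self.estimators, self.index_attr)
        ])
        return np.array([Counter(col).most_common(1)[0][0]
                         for col in all_preds.T])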
Example #9
def decision_tree_testing_depth(x_train, y_train, x_test, y_test, min_depth, max_depth):
    print('Decision Tree Depth Testing\n\n')
    accuracyTrain = np.zeros(max_depth - min_depth)
    accuracyTest = np.zeros(max_depth - min_depth)
    f1Train = np.zeros(max_depth - min_depth)
    f1Test = np.zeros(max_depth - min_depth)
    depths = np.arange(min_depth, max_depth)
    index = 0
    for depth in depths:
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        accuracyTrain[index] = accuracy_score(preds_train, y_train)
        accuracyTest[index] = accuracy_score(preds_test, y_test)
        f1Train[index] = calc_f1(preds_train, y_train)
        f1Test[index] = calc_f1(preds_test, y_test)
        index += 1
    fig1 = plt.figure(1)
    plt.plot(depths, accuracyTrain)
    plt.plot(depths, accuracyTest)
    plt.title("Accuracy vs tree depth")
    plt.ylabel("Accuracy")
    plt.xlabel("Depth")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    fig1.show()

    f2 = plt.figure(2)
    plt.plot(depths, f1Train)
    plt.plot(depths, f1Test)
    plt.title("F1 vs number of trees")
    plt.ylabel("F1")
    plt.xlabel("Depth")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
Example #10
    #                       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
    #                       'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    #                       'stalk-surface-below-ring', 'stalk-color-above-ring',
    #                       'stalk-color-below-ring', 'veil-type', 'veil-color',
    #                       'ring-number', 'ring-type', 'spore-print-color',
    #                       'population', 'habitat'], inplace=True)

    data = data.sample(frac=1)

    n_train = int(len(data) * 0.7)
    train = data[:n_train]
    valid = data[n_train:]

    print('\ngenerate CART...')
    clf = DecisionTreeClassifier(type='CART')
    clf.fit(train, parent=None)
    plotTree.CART_Tree(clf.to_dict(), 'test/cart.png')
    print('num_leaves={}'.format(clf.get_num_leaves(clf.tree)))
    print('[train acc]\t{}'.format(clf.validate(train)))
    print('[valid acc][cart]\t{}'.format(clf.validate(valid)))

    print('\nprune CART...')
    pruned = clf.prune_cart(train, valid)
    print('num_leaves={}'.format(clf.get_num_leaves(pruned)))
    pruned_dict = clf.to_dict(pruned)
    plotTree.CART_Tree(pruned_dict, 'test/cart_p.png')
    print('[valid acc][cart_pruned]\t{}'.format(clf.validate(valid, pruned)))

    print('\ngenerate C4.5...')
    clf_C45 = DecisionTreeClassifier(type='C4.5', epsilon=1e-6)
    clf_C45.fit(train, parent=None)
Example #11
# X, y = watermelon_data.values[:, :-1], watermelon_data.values[:,-1]
iris = load_iris()
# X = iris.data[50:, :2]
# y = iris.target[50:]
X = iris.data[:, :2]
X, index = np.unique(X, axis=0, return_index=True)
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
y = iris.target[index]
# # a = NodeByID3(X, y, attributes=['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
# # a.fit()
# # a = NodeByC4_dot_5(X, y, attributes=['密度', '含糖率'])
# # a.fit()
classifier1 = DecisionTreeClassifier(criterion='GINI')  #, max_depth=4)
classifier1.fit(X, y)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
Z = classifier1.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# index = (Z == '是')
# Z[index] = 1
# Z[~index] = 0
# Z.astype('int')
cs = plt.contourf(xx, yy, Z, alpha=0.5)
plt.axis('tight')
colors = [[127 / 255, 127 / 255, 227 / 255], [163 / 255, 1, 213 / 255],
          [1, 127 / 255, 127 / 255]]
for i, color in zip([0, 1, 2], colors):
    # The original snippet was truncated here; a plausible completion that
    # scatters each class over the decision regions:
    plt.scatter(X[y == i, 0], X[y == i, 1], color=color,
                label=iris.target_names[i])
plt.legend()
plt.show()
Example #12
import math

import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from tree import DecisionTreeClassifier

data = pd.read_csv('german_credit.csv')

target = data[data.columns[0]]
train = data[data.columns[1:]]
m_d = 7
boston = load_boston()
X, X_test, y, y_test = train_test_split(boston.data,
                                        boston.target,
                                        test_size=0.25)
model = DecisionTreeClassifier(max_depth=m_d)
print(X[:10])
model.fit(X, y)
# a = []
print(y_test[:10])
# for i in range(0, y.shape[0], 1):
#     a.append(model.predict(X[i]))
# print(a[:10])
a = model.predict(X_test)
print(a[:10])
print(math.sqrt(np.sum((y_test - a)**2) / float(len(a))))

model2 = DecisionTreeRegressor(max_depth=m_d)
model2.fit(X, y)
b = model2.predict(X_test)
# print(model2)
print(b[:10])
print(math.sqrt(np.sum((y_test - b)**2) / float(len(b))))
Example #13
def find_most_important_feature(x_train, y_train, depth):
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(x_train, y_train)
    return clf.root.feature
Example #14
from tree import DecisionTreeClassifier
import pandas as pd  # optional

clf = DecisionTreeClassifier()

X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]

data = pd.DataFrame(X, Y, columns=['Height', 'Weight', 'Foot Size'])
print(data)

clf = clf.fit(X, Y)

questions = [[190, 70, 43], [175, 55, 40]]
predictions = clf.predict(questions)

print('\nPredictions:')
for i, prediction in enumerate(predictions):
    print(questions[i], prediction)
Example #15
class TestCases(unittest.TestCase):
    '''
    The following set of tests is based on Exercise 4 of the problem sets,
    in which a decision tree (using the ID3 selection criterion) was computed by hand
    '''
    def setUp(self):
        data = [['Si', 'No', 'No', 'No'], ['Si', 'No', 'Si', 'No'],
                ['No', 'No', 'No', 'Si'], ['No', 'No', 'Si', 'No'],
                ['No', 'Si', 'Si', 'Si']]
        self.df = pd.DataFrame(
            data,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        self.tree = DecisionTreeClassifier(self.df.columns[:-1], [],
                                           Criterion.ID3)
        self.tree.set_attribute_values(self.df.to_numpy()[:, 0:3])

        data_nan = [['Si', 'No', 'No', 'No'], ['?', 'No', 'Si', 'No'],
                    ['No', 'No', 'No', 'Si'], ['No', 'No', '?', 'No'],
                    ['No', 'Si', '?', 'Si']]
        self.df_nan = pd.DataFrame(
            data_nan,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        '''Example from slide 76 of the theory slides (decision trees); we force the tree shown on
        the slide, since we know (from working it by hand) what this tree should predict for the
        attribute vectors [?, c2] and [?, ?]
        '''
        data_2 = [['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b1', 'c2', 'Yes'], ['b2', 'c1', 'Yes'], ['b2', 'c1', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c2', 'No'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No'], ['b2', 'c2', 'No']]
        self.df2 = pd.DataFrame(data_2, columns=['A', 'B', 'Objectiu'])
        self.tree2 = DecisionTreeClassifier(self.df2.columns[:-1], [],
                                            Criterion.ID3)
        self.tree2.set_attribute_values(self.df2.to_numpy()[:, 0:2])

    def test_entropy(self):
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        entrpy = entropy(s)
        entrpy_cond = []
        for a in A.T:
            entrpy_cond.append(entropy_cond(s, a))

        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_entrpy_cond = []
        expected_entrpy_cond.append(
            3 / 5 *
            (-1 / 3 * log(1 / 3, 2) - 2 / 3 * log(2 / 3, 2)))  #operacio major
        expected_entrpy_cond.append(
            4 / 5 * (-3 / 4 * log(3 / 4, 2) - 1 / 4 * log(1 / 4, 2)))  #familia
        expected_entrpy_cond.append(
            2 / 5 * (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2)) + 3 / 5 *
            (-2 / 3 * log(2 / 3, 2) - 1 / 3 * log(1 / 3, 2)))  # gran

        self.assertTrue(entrpy == expected_entrpy)
        for i in range(3):
            self.assertTrue(entrpy_cond[i] == expected_entrpy_cond[i])

    def test_gini(self):
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))

        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            expected_gini - 3 / 5 *
            (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3))  #operacio major
        expected_gini_gain.append(
            expected_gini - 4 / 5 *
            (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  #familia
        expected_gini_gain.append(expected_gini - 2 / 5 *
                                  (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2) - 3 / 5 *
                                  (1 - 2 / 3 * 2 / 3 - 1 / 3 * 1 / 3))  # gran

        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_tree(self):
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        node0 = self.tree.model
        self.assertTrue(type(node0) == SubTree)

        node1 = node0.child_nodes['No']
        node2 = node0.child_nodes['Si']
        self.assertTrue(type(node1) == SubTree)

        node3 = node1.child_nodes['No']
        node4 = node1.child_nodes['Si']
        self.assertTrue(type(node3) == SubTree)

        node5 = node3.child_nodes['No']
        node6 = node3.child_nodes['Si']
        self.assertTrue(type(node2) != SubTree)
        self.assertTrue(type(node4) != SubTree)
        self.assertTrue(type(node5) != SubTree)
        self.assertTrue(type(node6) != SubTree)

        #decision nodes
        self.assertTrue(node0.A_header[node0.attribute] == 'Operacio major')
        self.assertTrue(node1.A_header[node1.attribute] == 'Familia')
        self.assertTrue(node3.A_header[node3.attribute] == 'Gran')

        #leaves
        self.assertTrue(node2 == 'No')
        self.assertTrue(node4 == 'Si')
        self.assertTrue(node5 == 'Si')
        self.assertTrue(node6 == 'No')

    def test_predict(self):
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        test = pd.DataFrame(
            [['No', 'No', 'No'], ['No', 'No', 'Si'], ['No', 'Si', 'No'],
             ['No', 'Si', 'Si'], ['Si', 'No', 'No'], ['Si', 'No', 'Si'],
             ['Si', 'Si', 'No'], ['Si', 'Si', 'Si']],
            columns=['Operacio major', 'Familia', 'Gran']).to_numpy()
        output = self.tree.predict(test).tolist()
        expected_output = ['Si', 'No', 'Si', 'Si', 'No', 'No', 'No', 'No']

        self.assertListEqual(output, expected_output)

    def test_nan_gain_entropy(self):
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]

        gain_output = []
        for a in A.T:
            gain_output.append(gain(s, a))

        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_gain = []
        expected_gain.append(4 / 5 *
                             (expected_entrpy - 3 / 4 *
                              (-1 / 3 * log(1 / 3, 2) - 2 / 3 * log(2 / 3, 2)))
                             )  # operacio major
        expected_gain.append(
            expected_entrpy - 4 / 5 *
            (-3 / 4 * log(3 / 4, 2) - 1 / 4 * log(1 / 4, 2)))  # familia
        expected_gain.append(
            3 / 5 * (expected_entrpy - 2 / 3 *
                     (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2))))  # gran

        for i in range(3):
            self.assertTrue(gain_output[i] == expected_gain[i])

    def test_gini_nan(self):
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))

        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            4 / 5 * (expected_gini - 3 / 4 *
                     (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3)))  #operacio major
        expected_gini_gain.append(
            expected_gini - 4 / 5 *
            (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  #familia
        expected_gini_gain.append(
            3 / 5 * (expected_gini - 2 / 3 *
                     (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2)))  # gran

        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_predict_nan(self):
        self.tree2.fit(self.df2.values[:, 0:2], self.df2.values[:, 2])
        test = pd.DataFrame([['?', 'c2'], ['?', '?']],
                            columns=['B', 'C']).to_numpy()
        output = self.tree2.predict(test).tolist()
        expected_output = ['No', 'Yes']

        probabilities1 = self.tree2.model.predict_nan_value(['?', 'c2'])
        probabilities2 = self.tree2.model.predict_nan_value(['?', '?'])

        expected_probabilities1 = 8 / 12, 4 / 12
        expected_probabilities2 = (8 / 12) * (5 / 8), (4 / 12 + (8 / 12) *
                                                       (3 / 8))

        self.assertListEqual(output, expected_output)
        self.assertTupleEqual(probabilities1, expected_probabilities1)
        self.assertTrue(
            abs(probabilities2[0] - expected_probabilities2[0]) <
            1E-15)  # truncation error
        self.assertTrue(
            abs(probabilities2[1] - expected_probabilities2[1]) <
            1E-15)  # truncation error
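These tests import entropy, entropy_cond, gini, gini_gain, and gain from elsewhere. A minimal reference sketch of the two entropy helpers, consistent with the expected values in test_entropy (and ignoring the '?' missing-value handling exercised by the later tests):

import numpy as np
from math import log

def entropy(s):
    # H(S) = -sum_v p(v) * log2(p(v)) over the label distribution of s
    _, counts = np.unique(s, return_counts=True)
    probs = counts / counts.sum()
    return -sum(p * log(p, 2) for p in probs)

def entropy_cond(s, a):
    # H(S|A) = sum_v |S_v|/|S| * H(S_v), splitting s by the values of a
    total = 0.0
    for v in np.unique(a):
        mask = (a == v)
        total += mask.sum() / len(a) * entropy(s[mask])
    return total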
Example #16
from random import randrange

def train_test(dataset, split=0.7):
    # The opening of this snippet was truncated; the import, signature, and
    # empty train list are inferred from the code below.
    train = []
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy


# Split out training and test sets to use in model
train, test = train_test(list_of_rows[1:])

# Instantiate manual classifier
clf = DecisionTreeClassifier(max_depth=5, min_samples_split=4)

# Fit / Create the decision tree
tree = clf.fit(train)

# Example of prediction generation
predictions = []
for row in list_of_rows[1:]:
    prediction = clf.predict(tree, row)
    predictions.append(prediction)

# Find accuracy of decision tree train & test data
training_accuracy = clf.accuracy(tree, train)
test_accuracy = clf.accuracy(tree, test)

print(f"Manual Training Accuracy: {training_accuracy:.2%}")
print(f"Manual Test Accuracy: {test_accuracy:.2%}")

Example #17
import pandas as pd

from tree import DecisionTreeClassifier
from metrics import accuracy_score
from utils import train_test_split

if __name__ == '__main__':
    column_names = ['parents', 'has_nurs', 'form', 'children',
                    'housing', 'finance', 'social', 'health', 'classes']
    data = pd.read_csv('./nursery.data', names=column_names)

    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=666)

    dt = DecisionTreeClassifier()

    dt.fit(X_train, y_train)  # fit on the training split, not the full data

    y_pred = dt.predict(X_test)
    print(y_pred)
    print()

    score = accuracy_score(y_test, y_pred)
    print(score)
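Note that train_test_split here comes from a local utils module, not sklearn. A minimal sketch with the same signature, assuming a seeded shuffle over the DataFrame rows (the internals are a guess):

import numpy as np

def train_test_split(X, y, test_size=0.2, random_state=None):
    # Hypothetical sketch: shuffle row indices with a fixed seed, slice off
    # the test fraction, and return the four splits.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(len(y))
    n_test = int(len(y) * test_size)
    test_idx, train_idx = indices[:n_test], indices[n_test:]
    return (X.iloc[train_idx], X.iloc[test_idx],
            y.iloc[train_idx], y.iloc[test_idx])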