def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    """Sweep decision-tree max_depth over 1-25 and report/plot metrics.

    For each depth, train/test accuracy and test F1 are recorded, printed,
    tabulated, and plotted to 'q1.png'.  The depth with the best test F1 is
    then refit so the root split can be inspected.

    Returns:
        The feature used at the root of the best tree
        (``clfMVP.root.feature`` -- assumes the project's
        DecisionTreeClassifier exposes a ``root`` node; confirm).
    """
    print('Decision Tree with depths 1-25 (inclusive)\n')
    depths = list(range(1, 26))
    graph_train = []
    graph_test = []
    graph_f1 = []
    for depth in depths:
        print('Current depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        # Predict and score each split once (the original re-predicted and
        # re-scored several times per iteration).
        train_acc = accuracy_score(clf.predict(x_train), y_train)
        preds_test = clf.predict(x_test)
        test_acc = accuracy_score(preds_test, y_test)
        f1_score = f1(y_test, preds_test)
        graph_train.append(train_acc)
        graph_test.append(test_acc)
        graph_f1.append(f1_score)
        print('Train {}'.format(train_acc))
        print('Test {}'.format(test_acc))
        print('F1 Test {}\n'.format(f1_score))
    table = pd.DataFrame({
        "Max Depth": depths,
        "Train Accuracy": graph_train,
        "Test Accuracy": graph_test,
        "F1 Accuracy": graph_f1
    })
    print(table)
    # Plot all three curves and save to a file.
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    # Fixed mislabelled title: this sweeps tree depth, not number of trees.
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')
    # Best depth according to test-set F1.
    top_f1 = max(graph_f1)
    best_depth = depths[graph_f1.index(top_f1)]
    print("The depth that gives the best validation accuracy is: ",
          best_depth, "which has an F1 accuracy of ", top_f1)
    # Refit at the best depth to inspect the root split.
    clfMVP = DecisionTreeClassifier(max_depth=best_depth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ",
          clfMVP.root.feature)
    print("The threshold to split on for this feature is: ",
          clfMVP.root.split)
    # Return the most important feature for use in main.
    return clfMVP.root.feature
def decision_tree_testing(x_train, y_train, x_test, y_test):
    """Fit a fixed depth-20 decision tree; print train/test accuracy and test F1."""
    print('Decision Tree\n\n')
    model = DecisionTreeClassifier(max_depth=20)
    model.fit(x_train, y_train)
    acc_train = accuracy_score(model.predict(x_train), y_train)
    acc_test = accuracy_score(model.predict(x_test), y_test)
    print('Train {}'.format(acc_train))
    print('Test {}'.format(acc_test))
    test_preds = model.predict(x_test)
    print('F1 Test {}'.format(f1(y_test, test_preds)))
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    """Fit one decision tree at the given depth.

    Returns:
        A (test F1, train accuracy, test accuracy) tuple.
    """
    model = DecisionTreeClassifier(max_depth=maxdepth)
    model.fit(x_train, y_train)
    acc_train = accuracy_score(model.predict(x_train), y_train)
    acc_test = accuracy_score(model.predict(x_test), y_test)
    test_preds = model.predict(x_test)
    return f1(y_test, test_preds), acc_train, acc_test
def decision_tree_tune(x_train, y_train, x_test, y_test):
    """Tune decision-tree max_depth over 1-25, choosing by test F1.

    Prints per-depth metrics (rounded to 3 places), tabulates them, plots
    the curves to 'decision_tree_output.png', and returns the depth with
    the highest test F1 score.
    """
    print('Decision Tree tune\n\n')
    depths = list(range(1, 26))
    plot_train = []
    plot_test = []
    plot_f1 = []
    for depth in depths:
        print('Max Depth: ', depth)  # fixed typo: was 'Math Depth'
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        # One predict per split (the original re-predicted x_test for F1).
        train_accuracy = round(accuracy_score(clf.predict(x_train), y_train), 3)
        preds_test = clf.predict(x_test)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        f1_score = round(f1(y_test, preds_test), 3)
        print('F1 Test {}'.format(f1_score))
        print('\n')
        plot_train.append(train_accuracy)
        plot_test.append(test_accuracy)
        plot_f1.append(f1_score)
    df = pd.DataFrame({
        "Max_Depth": depths,
        "Train_Accuracy": plot_train,
        "Test_Accuracy": plot_test,
        "F1_Accuracy": plot_f1
    })
    print(df)
    best_f1 = max(plot_f1)
    best_depth = depths[plot_f1.index(best_f1)]
    print("The best Depth is ", best_depth, "with F1 accuracy ", best_f1)
    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()
    return best_depth
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
    """Fit a decision tree at ``max_depth`` and report accuracy and F1.

    Returns:
        (train accuracy, test accuracy, train F1, test F1)
    """
    print('Decision Tree')
    print("depth : %d" % max_depth)
    # NOTE(review): max_depth is passed positionally; assumes it is the
    # classifier's first constructor parameter -- confirm against tree.py.
    clf = DecisionTreeClassifier(max_depth)
    clf.fit(x_train, y_train)
    # Predict once per split; the original re-predicted both sets for F1.
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))
    return train_accuracy, test_accuracy, f1_train, f1_test
def decision_tree_testing_depth(x_train, y_train, x_test, y_test, min, max):
    """Evaluate decision trees for every depth in [min, max) and plot results.

    Figure 1 plots train/test accuracy against depth; figure 2 plots
    train/test F1 against depth.  Nothing is returned.

    NOTE(review): ``min``/``max`` shadow the builtins; the names are kept
    to preserve the public signature.
    """
    print('#Decision Tree Depth Testing\n\n')
    n_depths = max - min
    accuracyTrain = np.zeros(n_depths)
    accuracyTest = np.zeros(n_depths)
    f1Train = np.zeros(n_depths)
    f1Test = np.zeros(n_depths)
    depths = np.arange(min, max)
    for index, depth in enumerate(depths):
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        accuracyTrain[index] = accuracy_score(preds_train, y_train)
        accuracyTest[index] = accuracy_score(preds_test, y_test)
        # BUG FIX: the F1 assignments were swapped -- f1Test was computed
        # from the TRAIN predictions and f1Train from the TEST predictions.
        f1Train[index] = calc_f1(preds_train, y_train)
        f1Test[index] = calc_f1(preds_test, y_test)
    # Renamed the figure handle from `f1` so it no longer shadows the F1
    # metric name used elsewhere in this file.
    fig1 = plt.figure(1)
    plt.plot(depths, accuracyTrain)
    plt.plot(depths, accuracyTest)
    plt.title("accuracy vs number of trees")
    plt.ylabel("Accuracy")
    plt.xlabel("Depth")
    plt.legend(['Training Accuracy', 'Testing Accuracy'])
    fig1.show()
    fig2 = plt.figure(2)
    plt.plot(depths, f1Train)
    plt.plot(depths, f1Test)
    plt.title("F1 vs number of trees")
    plt.ylabel("F1")
    plt.xlabel("Depth")
    plt.legend(['Training F1', 'Testing F1'])
    plt.show()
return train, dataset_copy # Split out training and test sets to use in model train, test = train_test(list_of_rows[1:]) # Instantiate manual classifier clf = DecisionTreeClassifier(max_depth=5, min_samples_split=4) # Fit / Create the decision tree tree = clf.fit(train) # Example of prediction generation predictions = [] for row in list_of_rows[1:]: prediction = clf.predict(tree, row) predictions.append(prediction) # Find accuracy of decision tree train & test data training_accuracy = clf.accuracy(tree, train) test_accuracy = clf.accuracy(tree, test) print(f"Manual Training Accuracy: {training_accuracy:.2%}") print(f"Manual Test Accuracy: {test_accuracy:.2%}") # ============================================================================= # Compare to actual function using pandas and sklearn # ============================================================================= df = pd.read_csv("iris.csv") train, test = train_test_split(df,
class TestCases(unittest.TestCase):
    '''The following tests are based on exercise 4 of the problem set, in
    which a decision tree (ID3 selection criterion) was computed by hand.
    (Original comments translated from Catalan.)'''

    def setUp(self):
        # Hand-worked dataset: predict 'Enviar a casa' (send home) from
        # three categorical attributes.
        data = [['Si', 'No', 'No', 'No'], ['Si', 'No', 'Si', 'No'],
                ['No', 'No', 'No', 'Si'], ['No', 'No', 'Si', 'No'],
                ['No', 'Si', 'Si', 'Si']]
        self.df = pd.DataFrame(
            data,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        self.tree = DecisionTreeClassifier(self.df.columns[:-1], [],
                                           Criterion.ID3)
        self.tree.set_attribute_values(self.df.to_numpy()[:, 0:3])
        # Same dataset with '?' marking missing values.
        data_nan = [['Si', 'No', 'No', 'No'], ['?', 'No', 'Si', 'No'],
                    ['No', 'No', 'No', 'Si'], ['No', 'No', '?', 'No'],
                    ['No', 'Si', '?', 'Si']]
        self.df_nan = pd.DataFrame(
            data_nan,
            columns=['Operacio major', 'Familia', 'Gran', 'Enviar a casa'])
        # Example from slide 76 of the theory deck (decision trees): we force
        # the tree shown on the slide, since we know (from the hand-worked
        # solution) what this tree should predict for the attribute vectors
        # [?, c2] and [?, ?].
        data_2 = [['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b1', 'c2', 'Yes'], ['b1', 'c2', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c1', 'Yes'],
                  ['b2', 'c1', 'Yes'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No'],
                  ['b2', 'c2', 'No'], ['b2', 'c2', 'No']]
        self.df2 = pd.DataFrame(data_2, columns=['A', 'B', 'Objectiu'])
        self.tree2 = DecisionTreeClassifier(self.df2.columns[:-1], [],
                                            Criterion.ID3)
        self.tree2.set_attribute_values(self.df2.to_numpy()[:, 0:2])

    def test_entropy(self):
        # Compare entropy / conditional entropy against hand-computed values.
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        entrpy = entropy(s)
        entrpy_cond = []
        for a in A.T:
            entrpy_cond.append(entropy_cond(s, a))
        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_entrpy_cond = []
        expected_entrpy_cond.append(
            3 / 5 * (-1 / 3 * log(1 / 3, 2) -
                     2 / 3 * log(2 / 3, 2)))  # Operacio major
        expected_entrpy_cond.append(
            4 / 5 * (-3 / 4 * log(3 / 4, 2) -
                     1 / 4 * log(1 / 4, 2)))  # Familia
        expected_entrpy_cond.append(
            2 / 5 * (-1 / 2 * log(1 / 2, 2) - 1 / 2 * log(1 / 2, 2)) +
            3 / 5 * (-2 / 3 * log(2 / 3, 2) - 1 / 3 * log(1 / 3, 2)))  # Gran
        self.assertTrue(entrpy == expected_entrpy)
        for i in range(3):
            self.assertTrue(entrpy_cond[i] == expected_entrpy_cond[i])

    def test_gini(self):
        # Compare Gini impurity / gain against hand-computed values.
        s = self.df.values[:, 3]
        A = self.df.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))
        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            expected_gini -
            3 / 5 * (1 - 1 / 3 * 1 / 3 - 2 / 3 * 2 / 3))  # Operacio major
        expected_gini_gain.append(
            expected_gini -
            4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            expected_gini -
            2 / 5 * (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2) -
            3 / 5 * (1 - 2 / 3 * 2 / 3 - 1 / 3 * 1 / 3))  # Gran
        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_tree(self):
        # Fit the tree and verify its structure matches the hand-built one.
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        node0 = self.tree.model
        self.assertTrue(type(node0) == SubTree)
        node1 = node0.child_nodes['No']
        node2 = node0.child_nodes['Si']
        self.assertTrue(type(node1) == SubTree)
        node3 = node1.child_nodes['No']
        node4 = node1.child_nodes['Si']
        self.assertTrue(type(node3) == SubTree)
        node5 = node3.child_nodes['No']
        node6 = node3.child_nodes['Si']
        self.assertTrue(type(node2) != SubTree)
        self.assertTrue(type(node4) != SubTree)
        self.assertTrue(type(node5) != SubTree)
        self.assertTrue(type(node6) != SubTree)
        # Decision nodes.
        self.assertTrue(node0.A_header[node0.attribute] == 'Operacio major')
        self.assertTrue(node1.A_header[node1.attribute] == 'Familia')
        self.assertTrue(node3.A_header[node3.attribute] == 'Gran')
        # Leaves.
        self.assertTrue(node2 == 'No')
        self.assertTrue(node4 == 'Si')
        self.assertTrue(node5 == 'Si')
        self.assertTrue(node6 == 'No')

    def test_predict(self):
        # Predict all 8 attribute combinations against known answers.
        self.tree.fit(self.df.values[:, 0:3], self.df.values[:, 3])
        test = pd.DataFrame(
            [['No', 'No', 'No'], ['No', 'No', 'Si'], ['No', 'Si', 'No'],
             ['No', 'Si', 'Si'], ['Si', 'No', 'No'], ['Si', 'No', 'Si'],
             ['Si', 'Si', 'No'], ['Si', 'Si', 'Si']],
            columns=['Operacio major', 'Familia', 'Gran']).to_numpy()
        output = self.tree.predict(test).tolist()
        expected_output = ['Si', 'No', 'Si', 'Si', 'No', 'No', 'No', 'No']
        self.assertListEqual(output, expected_output)

    def test_nan_gain_entropy(self):
        # Information gain in the presence of '?' (missing) values.
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gain_output = []
        for a in A.T:
            gain_output.append(gain(s, a))
        expected_entrpy = -3 / 5 * log(3 / 5, 2) - 2 / 5 * log(2 / 5, 2)
        expected_gain = []
        expected_gain.append(
            4 / 5 * (expected_entrpy - 3 / 4 *
                     (-1 / 3 * log(1 / 3, 2) -
                      2 / 3 * log(2 / 3, 2))))  # Operacio major
        expected_gain.append(
            expected_entrpy -
            4 / 5 * (-3 / 4 * log(3 / 4, 2) -
                     1 / 4 * log(1 / 4, 2)))  # Familia
        expected_gain.append(
            3 / 5 * (expected_entrpy - 2 / 3 *
                     (-1 / 2 * log(1 / 2, 2) -
                      1 / 2 * log(1 / 2, 2))))  # Gran
        for i in range(3):
            self.assertTrue(gain_output[i] == expected_gain[i])

    def test_gini_nan(self):
        # Gini gain in the presence of '?' (missing) values.
        s = self.df_nan.values[:, 3]
        A = self.df_nan.values[:, 0:3]
        gin = gini(s)
        gin_gain = []
        for a in A.T:
            gin_gain.append(gini_gain(s, a))
        expected_gini = 1 - 3 / 5 * 3 / 5 - 2 / 5 * 2 / 5
        expected_gini_gain = []
        expected_gini_gain.append(
            4 / 5 * (expected_gini - 3 / 4 *
                     (1 - 1 / 3 * 1 / 3 -
                      2 / 3 * 2 / 3)))  # Operacio major
        expected_gini_gain.append(
            expected_gini -
            4 / 5 * (1 - 3 / 4 * 3 / 4 - 1 / 4 * 1 / 4))  # Familia
        expected_gini_gain.append(
            3 / 5 * (expected_gini - 2 / 3 *
                     (1 - 1 / 2 * 1 / 2 - 1 / 2 * 1 / 2)))  # Gran
        self.assertTrue(gin == expected_gini)
        for i in range(3):
            self.assertTrue(gin_gain[i] == expected_gini_gain[i])

    def test_predict_nan(self):
        # Predictions and class probabilities for rows with missing values.
        self.tree2.fit(self.df2.values[:, 0:2], self.df2.values[:, 2])
        test = pd.DataFrame([['?', 'c2'], ['?', '?']],
                            columns=['B', 'C']).to_numpy()
        output = self.tree2.predict(test).tolist()
        expected_output = ['No', 'Yes']
        probabilities1 = self.tree2.model.predict_nan_value(['?', 'c2'])
        probabilities2 = self.tree2.model.predict_nan_value(['?', '?'])
        expected_probabilities1 = 8 / 12, 4 / 12
        expected_probabilities2 = (8 / 12) * (5 / 8), (4 / 12 +
                                                       (8 / 12) * (3 / 8))
        self.assertListEqual(output, expected_output)
        self.assertTupleEqual(probabilities1, expected_probabilities1)
        self.assertTrue(
            abs(probabilities2[0] - expected_probabilities2[0]) <
            1E-15)  # truncation error
        # BUG FIX: this previously compared probabilities2[1] with itself
        # (always true); compare against the expected value, with the same
        # tolerance used above.
        self.assertTrue(
            abs(probabilities2[1] - expected_probabilities2[1]) < 1E-15)
# Deduplicate samples, then z-score normalise each feature.
# NOTE(review): `X` and `iris` are defined earlier in the file (not shown
# here) -- presumably the iris dataset; confirm.
X, index = np.unique(X, axis=0, return_index=True)
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = iris.target[index]

# Fit a GINI-criterion decision tree on the normalised data.
classifier1 = DecisionTreeClassifier(criterion='GINI')
classifier1.fit(X, y)

# Build a 0.02-step mesh spanning the (padded) range of the first two
# features.
x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_lo, x_hi, 0.02),
                     np.arange(y_lo, y_hi, 0.02))

# Predict a class for every grid point and shade the decision regions.
Z = classifier1.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, alpha=0.5)
plt.axis('tight')

# Overlay the training points, one colour per class.
palette = [[127 / 255, 127 / 255, 227 / 255],
           [163 / 255, 1, 213 / 255],
           [1, 127 / 255, 127 / 255]]
for label, color in zip([0, 1, 2], palette):
    pts = np.where(y == label)
    plt.scatter(X[pts, 0], X[pts, 1], c=color)
plt.show()
from tree import DecisionTreeClassifier
import pandas as pd  # optional

# Toy demo: classify gender from [height, weight, foot size].
clf = DecisionTreeClassifier()

X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male',
     'female', 'female', 'female', 'male', 'male']

# Display the training data as a labelled table.
data = pd.DataFrame(X, Y, columns=['Height', 'Weight', 'Foot Size'])
print(data)

clf = clf.fit(X, Y)

# Predict two unseen samples and print each alongside its input.
questions = [[190, 70, 43], [175, 55, 40]]
predictions = clf.predict(questions)
print('\nPredictions:')
for question, prediction in zip(questions, predictions):
    print(question, prediction)
from tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Credit data is loaded but never used below.
# NOTE(review): likely leftover from an earlier experiment -- confirm
# before deleting.
data = pd.read_csv('german_credit.csv')
target = data[data.columns[0]]
train = data[data.columns[1:]]

m_d = 7  # shared max depth for both models

# Compare the project classifier against sklearn's regressor on Boston
# housing, using RMSE on a held-out 25% split.
boston = load_boston()
X, X_test, y, y_test = train_test_split(boston.data, boston.target,
                                        test_size=0.25)

model = DecisionTreeClassifier(max_depth=m_d)
# FIX: converted Python-2-only `print expr` statements to call syntax;
# single-argument print(...) behaves identically on Python 2 and 3 and
# matches the rest of the file.
print(X[:10])
model.fit(X, y)
print(y_test[:10])
a = model.predict(X_test)
print(a[:10])
print(math.sqrt(np.sum((y_test - a) ** 2) / float(len(a))))  # RMSE

model2 = DecisionTreeRegressor(max_depth=m_d)
model2.fit(X, y)
b = model2.predict(X_test)
print(b[:10])
print(math.sqrt(np.sum((y_test - b) ** 2) / float(len(b))))  # RMSE
import pandas as pd
from tree import DecisionTreeClassifier
from metrics import accuracy_score
from utils import train_test_split

if __name__ == '__main__':
    # Nursery dataset: last column is the class label, the rest are features.
    column_names = ['parents', 'has_nurs', 'form', 'children', 'housing',
                    'finance', 'social', 'health', 'classes']
    data = pd.read_csv('./nursery.data', names=column_names)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.2,
                                                        random_state=666)
    dt = DecisionTreeClassifier()
    # BUG FIX: previously fit on the full dataset (dt.fit(X, y)), leaking
    # the held-out test rows into training; fit only on the training split.
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print(y_pred)
    print()
    # Accuracy on the genuinely unseen test split.
    score = accuracy_score(y_test, y_pred)
    print(score)