Ejemplo n.º 1
0
import matplotlib.pyplot as plt
import numpy as np

from loadData import LoadData
from decisionTree import Node, DecisionTree, Evaluate
from inspection import Inspection

if __name__ == '__main__':
    # Driver script: builds one tree per value of ``i`` from the training
    # data, labels the train and test sets with it, and reads the results
    # back for comparison against the true labels.
    # NOTE(review): the visible snippet is truncated inside the loop body, so
    # how tr_err / te_err / x_arr are filled is not shown here.

    # Input TSV files and the files the predicted labels are written to.
    train_input = '../handout/education_train.tsv'
    test_input = '../handout/education_test.tsv'
    train_output = '../result/education_train.labels'
    test_output = '../result/education_test.labels'

    ld = LoadData()
    dataset = ld.load_data(train_input)
    dt = DecisionTree(ld)
    # Accumulators; presumably train error, test error and x-axis values for
    # a matplotlib plot -- TODO confirm against the rest of the script.
    tr_err = []
    te_err = []
    x_arr = []
    print(ld.head)
    # One iteration per entry of ld.head; ``i`` is passed as the second
    # argument of construct() (presumably a depth limit -- verify against
    # DecisionTree.construct).
    for i in range(len(ld.head)):
        root = dt.construct(dataset, i)
        # dt.traverse(root)
        # Both files are re-loaded before classification on every iteration.
        dt.classify(ld.load_data(train_input), root, train_output)
        dt.classify(ld.load_data(test_input), root, test_output)
        # Read train_output back, one entry per line of the file.
        with open(train_output, 'r') as f:
            predcol = f.read().splitlines()
        # Last column of the training TSV as strings, skipping the first
        # (header) row.
        realcol = np.loadtxt(train_input,
                             dtype=str,
                             delimiter='\t',
                             skiprows=1)[:, -1]
Ejemplo n.º 2
0
            self.gi=0
        else:
            count1 = 0
            for item in dataset:
                if item[-1]==dataset[0][-1]:
                    count1+=1
            count2 = len(dataset)-count1
            self.gi = (count1/len(dataset))*(count2/len(dataset))+(count2/len(dataset))*(count1/len(dataset))
        return self.gi

    # evaluate with error_rate and gini_impurity
    def evaluate(self):
        """Score ``self.ori_dataset`` with both inspection metrics.

        Returns:
            A 2-tuple ``(err_rate, gini_impurity)``: the result of
            ``self.error_rate`` followed by the result of
            ``self.gini_impurity``, each called on ``self.ori_dataset``.
        """
        return (
            self.error_rate(self.ori_dataset),
            self.gini_impurity(self.ori_dataset),
        )


if __name__ == '__main__':
    # Command line: <input data file> <output metrics file>.
    if len(sys.argv) < 3:
        # Exit with a readable usage message instead of a bare IndexError.
        sys.exit("usage: {} <infile> <outfile>".format(sys.argv[0]))
    infile = sys.argv[1]
    outfile = sys.argv[2]

    # Load the dataset and compute both metrics over it.
    ld = LoadData()
    ori_dataset = ld.load_data(infile)
    ins = Inspection(ori_dataset)
    # evaluate() yields the pair (err_rate, gini_impurity).
    err_rate, gini_impurity = ins.evaluate()

    # Each metric is a single string, so write() is the right call;
    # writelines() expects an iterable of lines.
    with open(outfile, 'w') as f:
        f.write("gini_impurity: {}\n".format(gini_impurity))
        f.write("error: {}\n".format(err_rate))
    # print(err_rate)
    # print(gini_impurity)
            right_branch = self.construct(new_dataset[1], col_index, depth)
            node.right = right_branch
            # print('col_index:',col_index)
            self.col.remove(col_index)

            return node

    def traverse(self, node):
        """Print the subtree rooted at ``node`` in pre-order.

        For each truthy node one line is printed holding its ``depth`` and
        ``attribute``; the left child is then visited before the right one.
        A falsy ``node`` (e.g. ``None``) prints nothing.
        """
        if not node:
            return
        # print(node.dataset,'\n')
        print(node.depth, '\t', node.attribute)
        # Visit the children in left-then-right order.
        for side in ('left', 'right'):
            self.traverse(getattr(node, side))


if __name__ == '__main__':
    # Manual smoke test: load the small training set, exercise a few
    # DecisionTree methods, then build a tree and print it.
    ld = LoadData()
    dataset = ld.load_data('../handout/small_train.tsv')
    dt = DecisionTree(dataset, 0)
    # The results of the next two calls (ds, giga) are not used by any
    # active line below; only the commented-out prints refer to ds.
    ds = dt.divide_dataset(dataset, 1)
    # gini = dt.gini_impurity(dataset,1)
    giga = dt.gini_gain(dataset, 1)
    # col = dt.get_attribute(dataset)
    # Build the tree from the full dataset and print one line per node.
    root = dt.construct(dataset)
    # print(root.left.left.left.right.depth)
    dt.traverse(root)
    # print(root.dataset)

    # print(dataset)
    # print(ds[0])
    # print(ds[1])