Ejemplo n.º 1
0
def test_accuracy_compare(root, data, prune1, prune2):
    d1 = 0
    d2 = 0

    for point, location in data:
        h1_right = classify(root, point, prune1) == location
        h2_right = classify(root, point, prune2) == location
        
        if h1_right and not h2_right:
            d1 += 1
        if h2_right and not h1_right:
            d2 += 1
    
    return (d1, d2)
Ejemplo n.º 2
0
    def classify(self, row, label=True):
        '''
        Let every tree in ``self.trees`` vote on ``row`` and combine the votes.

        Each tree's result (a mapping of label -> score from ``dt.classify``)
        contributes one vote for its highest-scoring label. With
        ``label=True`` the label with the most votes is returned; otherwise
        the whole ``{label: vote count}`` dictionary is returned.
        '''
        def leading_key(scores):
            # First key whose value is strictly above 0 and above every
            # earlier value; None when no value exceeds 0.
            winner, best = None, 0
            for key, value in scores.items():
                if value > best:
                    winner, best = key, value
            return winner

        votes = {}
        for tree in self.trees:
            choice = leading_key(dt.classify(tree, row))
            votes[choice] = votes.get(choice, 0) + 1

        if not label:
            return votes
        return leading_key(votes)
Ejemplo n.º 3
0
def test_accuracy(root, data, prune=-1):
    wrong = {}
    total = {}
    
    for point, location in data:
        total[location] = total.get(location, 0) + 1.0
        if classify(root, point, prune) != location:
            wrong[location] = wrong.get(location, 0) + 1.0
    
    err_locs = dict( (n, wrong.get(n, 0) / total.get(n, 0)) for n in set(wrong)|set(total) )
    err_all = sum(wrong.values()) / sum(total.values())
    return (err_locs , err_all)
def run_dt(data, tests):
    """Train a decision tree on ``data`` and classify every row of ``tests``.

    The tree is stored on the ``dt`` module as ``dt.tree``. When the fifth
    command-line item (``sys.argv[4]``) is ``"print"`` the tree is printed.

    Returns:
        A list with ``"yes"`` or ``"no"`` for each entry of ``tests``,
        depending on the truthiness of ``dt.classify``'s result.
    """
    # Attributes are all column indices except the last one.
    attributes = list(range(len(data[0]) - 1))
    # Initial call
    dt.tree = dt.DTL(data, attributes, False)
    # sys.argv[4] only exists when argv holds at least five items.
    if len(sys.argv) > 4 and sys.argv[4] == "print":
        dt.printTree(dt.tree, 0)

    return ["yes" if dt.classify(dt.tree, test) else "no" for test in tests]
Ejemplo n.º 5
0
def get_risks(records):
    """Predict a risk value for every area in ``records``.

    A tree is built from ``data/artif_data.txt`` (located next to the parent
    of the current working directory), then each record is classified.

    Args:
        records: mapping of area -> record passed to ``classify``.

    Returns:
        dict mapping each area to the first key of the mapping returned by
        ``classify`` for that area's record.
    """
    # NOTE: the tree is trained on every call - if that turns out to be slow,
    # the tree could be saved to a file instead.
    path = str(Path(os.getcwd()).parent) + "/data/artif_data.txt"
    # Context manager so the file handle is closed even if parsing fails.
    with open(path, 'r') as f:
        training_data = [line.rstrip().split(',') for line in f]
    # The first line is the header row, not training data.
    training_data.pop(0)

    tree = build_tree(training_data)

    predicted_risks = {}
    for area in records:
        predicted_risks[area] = list(classify(records[area], tree).keys())[0]

    return predicted_risks
Ejemplo n.º 6
0
def _is_yes(answer):
    """Return True when ``answer`` is 'y' or 'yes' (any case, whitespace ignored)."""
    return answer.strip().lower() in ('y', 'yes')


def main():
    """Build an ID3 tree, display it, and optionally classify new examples.

    Behaviour depends on the number of items in ``sys.argv``:
      * 2 items: ``find_files`` gets ``sys.argv[1:]`` for the training data
        and the user is asked whether to classify new examples;
      * 3 items: same training call, but classification always runs and the
        test data comes from ``find_files(argv=sys.argv[2:])``;
      * anything else: ``find_files()`` is called without arguments.
    """
    global graph

    argc = len(sys.argv)
    # Exactly three argv items means the test input was given up front, so
    # the "classify?" prompt is skipped.
    test = argc != 3
    if argc in (2, 3):
        data = find_files(argv=sys.argv[1:])
    else:
        data = find_files()
    data.print_data()

    target_attr = data.attributes[-1]
    tree = id3(data.values, data.attributes, target_attr)

    op = input("Do you want to create picture of tree graph? [y/n]: ")
    # Explicit comparison: a substring test such as ``op in 'yY'`` would also
    # accept an empty reply.
    if _is_yes(op):
        graph = pydot.Dot(graph_type='graph')
        print_tree(data.values, data.attributes, tree)
        f_name = data.name.split('.')[1][1:] + '.png'
        graph.write_png(f_name)
        print("Generated graph to file:", f_name)
    else:
        print_tree(data.values, data.attributes, tree)

    if test:
        op = input("Do you want to classify new examples? [y/n]: ")
        if not _is_yes(op):
            return
        test_data = find_files()
    else:
        test_data = find_files(argv=sys.argv[2:])

    # If the test file's attribute row differs from the training attributes
    # (minus the target), keep that row as data and reuse the training
    # attributes instead.
    if test_data.attributes != data.attributes[:-1]:
        test_data.values.insert(0, test_data.attributes)
        test_data.attributes = data.attributes[:-1]
    try:
        class_results = classify(tree, test_data.values,
                                 test_data.attributes)
        test_data.print_data(data=data, test=class_results)
    except ValueError:
        print("Unable to classify examples in:", test_data.name)
        exit(0)
Ejemplo n.º 7
0
def classify(tree_model, testlabels, testdata):
    """
    Predict by majority vote over the base classifiers.

    :param tree_model: trees of the individual base classifiers, list
    :param testlabels: feature labels of the test data, list
    :param testdata: test data, list
    :return: result of the combined classifier
    :raises ValueError: if no tree produced a prediction
    """
    vote = {}
    for tree in tree_model:
        # Why exceptions are caught: because of randomness, a constructed tree
        # may not contain every value of some feature and then cannot predict;
        # such a tree simply casts no vote.
        try:
            label = dtree.classify(tree, testlabels, testdata)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
            continue
        else:
            vote[label] = vote.get(label, 0) + 1
    if not vote:
        raise ValueError("no tree produced a prediction")
    # Highest count wins; ties are resolved by the larger label.
    result = max(zip(vote.values(), vote.keys()))[1]
    return result
Ejemplo n.º 8
0
def alternative_classifier(train_set, train_labels, test_set, test_labels,
                           **kwargs):
    """Classify ``test_set`` with a tree built from a reduced training set.

    Both sets are passed through ``reduce_data`` with ``[9, 12]``, the labels
    are inserted as column index 2, a tree is built from the training data
    and every test row is classified. The accuracy is printed and the
    confusion matrix is plotted and shown.

    Returns:
        list of predictions, one per test row.
    """
    reduced_train, reduced_test = reduce_data(train_set, test_set, [9, 12])

    # Insert the labels as column index 2 of each reduced set.
    labelled_train = np.insert(reduced_train, 2, train_labels, axis=1)
    labelled_test = np.insert(reduced_test, 2, test_labels, axis=1)

    tree = build_tree(labelled_train)
    pred_set = [classify(row, tree) for row in labelled_test]

    print(calculate_accuracy(test_labels, pred_set))

    plot_matrix(calculate_confusion_matrix(test_labels, pred_set))
    plt.show()

    return pred_set
    newTestSet = my_model.transform(test).tolist()
    newTrainSet = my_model.transform(training).tolist()

    ############# Model Building ##############
    for k in range(len(newSet)):
        newSet[k].append(trainingLabels[k])
    passingData = newSet[:]
    models.append(dt.buildtree(passingData))
    #    dt.prune(b,0.1)

    ############# Classification of Test Records ##############

    for j in range(len(newTestSet)):
        if j not in test_classify:
            test_classify[j] = []
        test_classify[j].append(dt.classify(newTestSet[j], models[i]))

    ############# Accuracy Calculations ##############

# Collect, for every entry of test_classify, the keys of all of its stored
# results as one flat list; ``f`` ends up with one such list per entry.
d = []
f = []
flat = []
for l in test_classify.values():
    # Reset the per-entry scratch lists.
    flat = []
    d = []
    for m in l:
        # ``m`` is a mapping; only its keys are kept.
        d.append(list(m.keys()))
    # Flatten the list of key lists into a single list.
    flat = [item for sublist in d for item in sublist]
    f.append(flat)

count = 0
Ejemplo n.º 10
0
import decision_tree as dtree

# Training rows: four feature values followed by one more value
# (presumably the class label - there are four entries in ``labels``).
data = [
    ['青年', '否', '否', '一般', '否'],
    ['青年', '否', '否', '好', '否'],
    ['青年', '是', '否', '好', '是'],
    ['青年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '好', '否'],
    ['中年', '是', '是', '好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '好', '是'],
    ['老年', '是', '否', '好', '是'],
    ['老年', '是', '否', '非常好', '是'],
    ['老年', '否', '否', '一般', '否'],
]
# Feature names, one per feature column of ``data``.
labels = ['年龄', '有工作', '有自己的房子', '信贷情况']
mytree = dtree.create_tree(data, labels)
print(mytree)

# A single sample to classify (feature values only).
testdata = ['青年', '否', '否', '非常好']
testlabel = ['年龄', '有工作', '有自己的房子', '信贷情况']
# Because ``labels`` is modified while the decision tree model is built, it
# cannot be reused directly for prediction; a fresh list is passed instead.
result = dtree.classify(mytree, testlabel, testdata)
print(result)
Ejemplo n.º 11
0
def predict_classify(forest, test):
    """Classify ``test`` with every tree in ``forest`` and combine the results.

    Each tree's prediction comes from ``decision_tree.classify``; the list of
    predictions is handed to ``decision_tree.max_cnt``, whose result is
    returned.
    """
    votes = [decision_tree.classify(tree, test) for tree in forest]
    return decision_tree.max_cnt(votes)
Ejemplo n.º 12
0
def main(argv):
    """Entry point: run the decision tree classifier.

    ``argv`` is accepted but not used by this function.
    """
    # run decision tree classifier
    decision_tree.classify()
Ejemplo n.º 13
0
def main():
    """Build a tree from the CSV data set and print the prediction for one sample."""
    headers, data_set = read_dataset("../csv_data/data_set.csv")
    tree = build_tree(data_set, headers)

    sample = [6.44, 21.0, 65.22, 1431.0, 19.0, 99.0]
    prediction = classify(sample, tree)
    print(print_leaf(prediction))
Ejemplo n.º 14
0
import decision_tree
import json
import tree_plotter

# Read the data set: one tab-separated record per line. The context manager
# closes the file once it has been read.
with open(r'/home/zhaoguanyi/PycharmProjects/Decision Tree/watermelon.txt') as fr:
    listWm = [inst.strip().split('\t') for inst in fr]
print(listWm)
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']  # feature labels
Trees = decision_tree.createTree(listWm, labels)  # build the decision tree

print(json.dumps(Trees, ensure_ascii=False))  # print the decision tree

# Test
# A fresh label list is used for classification (presumably because
# createTree modifies the list it is given - confirm against decision_tree).
labels = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
# Classify at most the first 17 records; slicing avoids an IndexError when
# the file holds fewer rows.
for record in listWm[:17]:
    testData = record[:6]
    print(testData)
    testClass = decision_tree.classify(Trees, labels, testData)  # test
    print(json.dumps(testClass, ensure_ascii=False))

tree_plotter.createPlot(Trees)  # visualise the decision tree
Ejemplo n.º 15
0
import decision_tree
import tree_plotter
import numpy as np


# Create the data set
def createDataSet():
    """Return the sample data set and its feature labels.

    Returns:
        ``(dataSet, labels)``: a list of five three-element rows and a list
        of two label strings. Fresh lists are created on every call.
    """
    rows = (
        ('可以生存', '有', "鱼类"),
        ('可以生存', '有', "鱼类"),
        ('可以生存', '没有', "非鱼类"),
        ('不能生存', '有', "非鱼类"),
        ('不能生存', '有', "非鱼类"),
    )
    feature_names = ['不浮出水面是否可以生存', '是否有脚蹼']
    return [list(row) for row in rows], feature_names


if __name__ == "__main__":
    # Build and show the tree from the sample data set.
    dataSet, labels = createDataSet()
    tree = decision_tree.createTree(dataSet, labels)
    print(tree)

    # Classify one sample using a freshly created label list.
    labels = createDataSet()[1]
    sample = ["不能生存", "有"]
    result = decision_tree.classify(tree, labels, sample)
    print(result)
Ejemplo n.º 16
0
	def classify(self, obs):
		"""Return the median of the labels predicted for ``obs`` by all trees.

		Each tree in ``self.trees`` classifies ``obs`` via ``dt.classify``; the
		results are converted with ``dt.convertToLabel`` and their median
		(``np.median``) is returned.
		"""
		# Real lists instead of lazy ``map`` objects: on Python 3, np.median
		# cannot compute over a map iterator.
		raw_preds = [dt.classify(obs, tree) for tree in self.trees]
		labels = [dt.convertToLabel(pred) for pred in raw_preds]
		return np.median(labels)
Ejemplo n.º 17
0
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
import csv

training_data = []

# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects.
with open('data.csv', encoding="utf8", newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise
            # IndexError.
            continue
        # NOTE(review): only the first parsed field is used and it is split on
        # commas again - presumably each line is stored as one quoted field;
        # confirm against data.csv.
        training_data.append(row[0].split(','))

my_tree = build_tree(training_data)

print_tree(my_tree)
print()

testing_data = []

# Ask the user for a value of every header column except the last one,
# showing the values found in the training data for that column.
for i in range(len(get_header()) - 1):
    ask = 'Введіть ' + str(get_header()[i]) + str(
        get_unique_values(training_data, i)) + ': '
    user_input = input(ask)
    testing_data.append(user_input)

print("Передбачено: %s" % (print_leaf(classify(testing_data, my_tree))))

# Wait for Enter before the script exits.
input()
Ejemplo n.º 18
0
    my_model = PCA(n_components=pca_comps, svd_solver='full')
    newSet = my_model.fit_transform(training).tolist()
    newTestSet = my_model.transform(test).tolist()
    newTrainSet = my_model.transform(training).tolist()

    ############# Model Building ##############
    for i in range(len(newSet)):
        newSet[i].append(trainingLabels[i])
    passingData = newSet[:]
    b = dt.buildtree(passingData)
    dt.prune(b, 0.1)

    ############# Classification of Train Records ##############
    count = 0
    for i in range(len(newTrainSet)):
        a = dt.classify(newTrainSet[i], b)
        for key in a.keys():
            if (key == trainingLabels[i]):
                count = count + 1

    ############# Accuracy Calculations for Training DataSet ##############
    accuracy = (count / len(newTrainSet)) * 100
    final_train_acc += accuracy
    print('Train accuracy:', accuracy)

    ############# Classification of Test Records ##############
    count = 0
    accuracy = 0
    for i in range(len(newTestSet)):
        a = dt.classify(newTestSet[i], b)
        for key in a.keys():
Ejemplo n.º 19
0
import decision_tree

if __name__ == '__main__':
    # Read every line of the data file and split it on tabs into a list.
    # The context manager closes the file once it has been read.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRate"]
    # Build the decision tree with the createTree() function implemented in
    # decision_tree.
    lensesTree = decision_tree.createTree(lenses, lensesLabels)
    print(lensesTree)

    # Note: when using the classifier a fresh label list must be passed; the
    # earlier list cannot be reused because the tree-building function
    # deletes entries from the label list it is given.
    labels = ["age", "prescript", "astigmatic", "tearRate"]
    # Use the classifier function to predict unseen data.
    result = decision_tree.classify(lensesTree, labels,
                                    ["young", "hyper", "yes", "reduced"])
    print(result)
Ejemplo n.º 20
0
x_train, x_test, y_train, y_test = train_test_split(features_train, labels_train, test_size=0.33)

# concatenate features and labels
data_train = np.column_stack((x_train, y_train))
data_test = np.column_stack((x_test, y_test))

# build decision tree using entropy
decision_tree = dt.buildtree(data_train, dt.entropy, 0.01)

min_gain_error = {}
# test minimal gain values for pruning
for min_gain_value in np.arange(0, 1, 0.01):
    # dt.prune is called for its side effect (its return value is unused), so
    # it presumably mutates the tree in place. A deep copy keeps the child
    # nodes of the original tree from being pruned by the trial runs.
    dt_temp = copy.deepcopy(decision_tree)
    dt.prune(dt_temp, min_gain_value)
    # classify test data; real lists (not lazy map objects) so that np.array
    # builds a proper array on Python 3
    predictions = [dt.classify(obs, dt_temp) for obs in x_test]
    y_hat = np.array([dt.convertToLabel(pred) for pred in predictions])
    error = (y_hat != y_test).sum() / float(y_test.shape[0])
    min_gain_error[min_gain_value] = error

# prune tree with optimal min_gain value: the key with the LOWEST error
# (min over the (key, value) items would just return the smallest key)
min_gain_opt = min(min_gain_error, key=min_gain_error.get)

dt.prune(decision_tree, min_gain_opt)

# print and draw decision tree
# dt.drawtree(decision_tree,png='census_decision_tree.png')
# dt.printtree(decision_tree)
def forest_classify(trees, input):
    """Return the prediction made most often by ``trees`` for ``input``.

    Every tree is asked via ``classify``; the most common answer wins.
    Raises IndexError when ``trees`` is empty.
    """
    predictions = []
    for tree in trees:
        predictions.append(classify(tree, input))

    ranked = Counter(predictions).most_common(1)
    return ranked[0][0]
Ejemplo n.º 22
0
    my_model = PCA(n_components=pca_comps, svd_solver='full')
    newSet = my_model.fit_transform(rows_total).tolist()
    newtestSet = my_model.transform(rows_test_total).tolist()

    ############# Model Building ##############

    for i in range(len(rows_total)):
        newSet[i].append(training_labels[i])
    b = dt.buildtree(newSet)
    dt.prune(b, 0.1)

    ############# Classification of Test Records ##############
    number = 0
    accuracy = 0
    for i in range(testSize):
        a = dt.classify(newtestSet[i], b)
        for key in a.keys():
            if (key == testing_labels[i]):
                number = number + 1

    ############# Accuracy Calculations ##############

    accuracy = (number / testSize) * 100
    final_test_acc += accuracy
    print('Test accuracy:', accuracy)

    ############# Classification of Training Records ##############
    number = 0
    accuracy = 0
    train_label = []
    for i in range(trainSize):