コード例 #1
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_i_put_result_draw(set_trump):
    """Positions where I lead and the expected estimate is a draw (0.5)."""
    set_trump('hearts')
    # Single card each, same suit, adjacent ranks.
    root = build_tree([Card('clubs', 10)], [Card('clubs', 11)], True)
    assert node_estimate(root) == 0.5
    # High club against a low trump card.
    root = build_tree([Card('clubs', 14)], [Card('hearts', 6)], True)
    assert node_estimate(root) == 0.5
コード例 #2
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_rival_puts_rival_wins(set_trump):
    """Positions where the rival leads and the estimate is a loss (0.0)."""
    set_trump('hearts')
    root = build_tree([Card('spades', 10)], [Card('clubs', 9)], False)
    assert node_estimate(root) == 0.0
    # A larger hand, including a trump, still evaluates to a loss here.
    root = build_tree(
        [Card('clubs', 6), Card('clubs', 8), Card('clubs', 9),
         Card('hearts', 8)],
        [Card('clubs', 7)],
        False)
    assert node_estimate(root) == 0.0
コード例 #3
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_i_put_i_win(set_trump, monkeypatch):
    """Positions where I lead and the estimate is a win (1.0)."""
    set_trump('hearts')
    # Limit the search depth for this test.
    monkeypatch.setattr(decision_tree, 'MAXDEPTH', 4)
    root = build_tree([Card('clubs', 10)], [Card('clubs', 9)], True)
    assert node_estimate(root) == 1.0
    root = build_tree(
        [Card('clubs', 6), Card('clubs', 8), Card('clubs', 9),
         Card('hearts', 8)],
        [Card('clubs', 7)],
        True)
    assert node_estimate(root) == 1.0
コード例 #4
0
def random_forest_training(data_train, trees_num):
    """Build a random forest.

    input:  data_train(list): training data
            trees_num(int):   number of classification trees to grow
    output: trees_result(list):  the best split structure of every tree
            trees_feature(list): the original features selected by every tree
    """
    trees_result = []   # best split found for each tree
    trees_feature = []  # features used by each tree
    n = np.shape(data_train)[1]  # dimensionality of a sample
    # Features per tree: log2(n - 1) + 1, with a floor of 1.
    if n > 2:
        k = int(log(n - 1, 2)) + 1
    else:
        k = 1
    # Grow each tree.
    # NOTE: `range` replaces the Python-2-only `xrange` (NameError on
    # Python 3); semantics here are identical.
    for i in range(trees_num):
        # 1. randomly choose m samples and k features
        data_samples, feature = choose_samples(data_train, k)
        # 2. build one classification tree
        tree = build_tree(data_samples)
        # 3. keep the trained tree
        trees_result.append(tree)
        # 4. keep the features that tree used
        trees_feature.append(feature)

    return trees_result, trees_feature
コード例 #5
0
def random_forest(trainX, trainy, trees_num, gr=0):
    """Grow `trees_num` trees, each fitted on a bootstrap sample of trainX."""
    forest = []
    # Feature count and K are pure reads of trainX, hoisted out of the loop.
    feature_num = trainX.shape[1]
    K = round(np.log2(feature_num))
    for i in range(trees_num):
        sample_idx = bootstrap(len(trainX))
        #print(sample_idx)
        # Fresh feature-availability flags for every tree.
        flags = [1] * feature_num

        node = Tree()

        fitted = build_tree(trainX[sample_idx],
                            trainy[sample_idx],
                            flags,
                            depth=trainX.shape[1],
                            RF=1,
                            K=K,
                            gr=gr)
        # Copy the fitted root's fields into the fresh node.
        node.data = fitted.data
        node.left = fitted.left
        node.right = fitted.right
        forest.append(node)
        print("第几棵树", i)
    return forest
コード例 #6
0
ファイル: Project4.py プロジェクト: asgray/ML-Class
def proj_demo(name):
    """Fit a tree on a 120-row sample of dataset *name* and print accuracy."""
    # A small subset keeps the printed tree readable and the runtime short.
    sample_df = u.get_data(name).sample(120)
    # Simple train/test split.
    split = u.split_to_train_test_sets(sample_df)
    training_set = split['Training_Set']
    test_set = split['Test_Set']
    tree = dt.build_tree(training_set)
    # Show the data subsets and the fitted tree.
    print('Training Data Sample: \n ', training_set.head())
    print('Test Data Sample : \n', test_set.head())
    pprint(tree)
    sleep(5)
    n_tests = test_set.shape[0]
    n_correct = 0
    print('Sample Classifications:')
    for idx in range(n_tests):
        # Classify one test row at a time.
        item = test_set.iloc[idx, :]
        prediction = dt.make_prediction(item, tree)
        actual_val = item[-1]
        correct = prediction == actual_val
        # Only the first five classifications are echoed to the console.
        if idx < 5:
            print(
                f'Correct: {correct} \t Actual Value: {actual_val} \t Predicted Value: {prediction}'
            )
        if correct:
            n_correct += 1
    # Overall classification accuracy.
    print(f'Accuracy: {round(n_correct/n_tests,2)} \n')
コード例 #7
0
ファイル: tree.py プロジェクト: csammcgrath/CS450
    def __init__(self, data, targets, headers):
        """Keep the inputs and build the decision tree from data + targets."""
        self.data = data
        self.targets = targets
        self.headers = headers

        # The tree builder needs features and labels in one frame, so the
        # targets are concatenated onto the data column-wise.
        combined = pd.concat([data, targets], axis=1)

        self.tree = watdt.build_tree(combined, headers[:-1])
コード例 #8
0
def get_risks(records):
    """Predict a risk class for every area key in *records*.

    NOTE (from the original Polish comment): the tree is retrained on every
    call — if that ever becomes slow, the trained tree could be persisted to
    a file instead.
    """
    path = str(Path(os.getcwd()).parent) + "/data/artif_data.txt"
    # Context manager guarantees the file is closed; the original opened the
    # file and never closed it (handle leak).
    with open(path, 'r') as f:
        training_data = [line.rstrip().split(',') for line in f]
    header = training_data.pop(0)  # drop the header row

    tree = build_tree(training_data)

    predicted_risks = {}
    for area in records:
        # Take the first key of the mapping classify() returns for the area.
        predicted_risks[area] = list(classify(records[area], tree).keys())[0]

    return predicted_risks
コード例 #9
0
def run_tests(df, df_training, labels):
    """Train and report a tree for every measure at depths 1 through 4."""
    min_split = 1
    for measure in dt.Measure:
        for tree_depth in range(1, 5):
            # df.values is a pure read, so recomputing it per run is harmless.
            test_set = df.values
            tree = dt.build_tree(df_training.values,
                                 max_depth=tree_depth,
                                 min_size=min_split,
                                 measure=measure)
            # Banner-delimited report for this configuration.
            print("=" * 40)
            dt.print_tree(tree, labels)
            print('Min split:   {}'.format(min_split))
            print('Tree depth:  {}'.format(tree_depth))
            print('Train Size:  {}'.format(len(df_training)))
            print('Test Size:   {}'.format(len(test_set)))
            print('Accuracy:    {:.4f}'.format(dt.accuracy(test_set, tree)))
            print('Measure:     {}'.format(measure))
            print("=" * 40)
コード例 #10
0
def main():
    """Load the wine data, fit a depth-3 GINI tree, report, then prune."""
    # Widen data-frame console output.
    pd.set_option('display.max_columns', 11)
    pd.set_option('display.width', 200)

    # Read the data and strip garbage values.
    df = pd.read_csv('winequalityN.csv')
    df = dt.remove_garbage(
        pd.DataFrame(data=df, columns=list(df.columns.values)))
    cols = df.columns.tolist()
    # Move the wine-colour column to the end, then drop one feature.
    df = df[cols[1:] + cols[0:1]].drop(['total sulfur dioxide'], axis='columns')
    labels = df.columns.values

    # Per-colour frames (unused below, kept as in the original).
    df_white = df[(df['type'] == 0.0)]
    df_red = df[(df['type'] == 1.0)]
    # Mixed 100-row training sample; fixed seed for reproducibility.
    df_training = df.sample(n=100, random_state=1)

    tree_depth = 3
    min_split = 1
    test_set = df.values
    measure = dt.Measure.GINI
    tree = dt.build_tree(df_training.values,
                         max_depth=tree_depth,
                         min_size=min_split,
                         measure=measure)
    # Report the fitted tree and its accuracy on the full dataset.
    print("=" * 40)
    dt.print_tree(tree, labels)
    print('Min split:   {}'.format(min_split))
    print('Tree depth:  {}'.format(tree_depth))
    print('Train Size:  {}'.format(len(df_training)))
    print('Test Size:   {}'.format(len(test_set)))
    print('Accuracy:    {:.4f}'.format(dt.accuracy(test_set, tree)))
    print('Measure:     {}'.format(measure))
    print("=" * 40)
    # Prune in place and show the simplified tree.
    dt.prune_tree(tree)
    dt.print_tree(tree, labels)
コード例 #11
0
def pushTrainTreeButton(ui):
    """Handler for the train-tree button: load data, build the tree, show it."""
    # Tell the user that data loading has started.
    ui.tipBrowser.append("start loading train data...")

    # Path to the training file comes from the line-edit widget.
    train_path = str(ui.trainTreeLine.text())
    dt_gui.columns, data = dt.load_data(train_path)
    # Train on a copy of the column list so the original stays intact.
    dt_gui.tree = dt.build_tree(data, dt_gui.columns[:])
    print(dt_gui.tree)

    # Render the decision tree as text in the display widget.
    lines = []
    dt.tree_content(dt_gui.tree, dt_gui.columns, lines)
    ui.treeShow.setText("")  # clear previous contents first
    tree_text = "".join(item + "\n" for item in lines)
    ui.treeShow.setText(tree_text)

    # Report success.
    ui.tipBrowser.append("train tree success")
    print("完成决策树训练")
コード例 #12
0
def alternative_classifier(train_set, train_labels, test_set, test_labels,
                           **kwargs):
    """Decision-tree classifier on two reduced features; returns predictions."""
    # Keep only the features at indices 9 and 12.
    train_set_red, test_set_red = reduce_data(train_set, test_set, [9, 12])

    # Attach the labels as a third column of each array.
    train_data = np.insert(train_set_red, 2, train_labels, axis=1)
    test_data = np.insert(test_set_red, 2, test_labels, axis=1)

    tree = build_tree(train_data)

    # Classify every test row.
    pred_set = [classify(row, tree) for row in test_data]

    accuracy = calculate_accuracy(test_labels, pred_set)
    print(accuracy)

    # Plot the confusion matrix for the predictions.
    confusionMatrix = calculate_confusion_matrix(test_labels, pred_set)
    plot_matrix(confusionMatrix)
    plt.show()

    return pred_set
コード例 #13
0
ファイル: Project4.py プロジェクト: asgray/ML-Class
def five_fold_validation(data_set):
    """Return the mean classification accuracy over a five-fold split."""
    splits = u.five_fold_split(data_set)
    fold_scores = []
    # Each fifth takes one turn as the test set.
    for held_out in splits:
        test_set = None
        training_set = pd.DataFrame(columns=data_set.columns.values)
        for key in splits:
            if key == held_out:
                # The held-out fifth is the test set...
                test_set = splits[key]
            else:
                # ...and every other fifth joins the training set.
                training_set = training_set.append(splits[key], sort=False)

        # Build a tree from the training folds.
        tree = dt.build_tree(training_set)
        pprint(tree)
        print(test_set)
        n_rows = test_set.shape[0]
        hits = 0
        for idx in range(n_rows):
            # Compare the prediction against the row's last column.
            item = test_set.iloc[idx, :]
            if dt.make_prediction(item, tree) == item[-1]:
                hits += 1
        fold_scores.append(hits / n_rows)
    # Average accuracy across the folds.
    return sum(fold_scores) / len(fold_scores)
コード例 #14
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_rival_puts_result_draw(set_trump):
    """Rival leads with single cards each; the expected estimate is 0.5."""
    set_trump('hearts')
    root = build_tree([Card('clubs', 10)], [Card('clubs', 9)], False)
    assert node_estimate(root) == 0.5
コード例 #15
0
 def test_build_tree(self):
     """Smoke test: build a tree over all feature indices and print it."""
     node = build_tree(train_instances, train_labels,
                       range(len(train_instances[0])))
     # Call-form print works on both Python 2 and 3; the original
     # `print node` statement is a SyntaxError under Python 3.
     print(node)
コード例 #16
0
    # Build and cache the decision tree as JSON if no saved copy exists.
    if not os.path.exists(saved_filename):
        # If we don't have a saved tree, save it
        all_feats = []
        for i in range(len(text)):
            feat = make_features(text[i], phones[i])
            all_feats.extend(feat)
        idx = list(range(len(all_feats)))

        # Fixed seed so the sampled subset is reproducible across runs.
        random_state = np.random.RandomState(1999)
        random_state.shuffle(idx)
        # Out of 900k samples... but scaling is poor
        num_samples = 10000
        idx = idx[:num_samples]
        all_feats = [all_feats[i] for i in idx]
        # Let max leaves be > number of phones (44)
        tree = build_tree(all_feats, max_depth=50)
        dump_tree_to_json(tree, saved_filename)

    # Always reload from the JSON cache, whether freshly built or not.
    tree = load_tree_from_json(saved_filename)

    # Words to process: CLI arguments uppercased, or a single default word.
    if len(sys.argv) > 1:
        pred_text = list(sys.argv[1:])
        pred_text = [t.upper() for t in pred_text]
    else:
        pred_text = ["HEISENBERG"]
    all_wav = []
    for pt in pred_text:
        # quality of life hacks for simple words
        if pt == "I":
            print("Replacing I -> EYE")
            pt = "EYE"
コード例 #17
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_i_put_rival_wins(set_trump):
    """I lead with two cards against a low trump; expected estimate is 0.0."""
    set_trump('hearts')
    root = build_tree(
        [Card('clubs', 10), Card('diamonds', 8)],
        [Card('hearts', 6)],
        True)
    assert node_estimate(root) == 0.0
コード例 #18
0
import numpy as np
import decision_tree as DT
import perform_eval as PE
import kfold as KF
import time

# K-fold evaluation of a fixed-depth decision tree: for every fold in
# KF.res, train on its train_set, predict its test_set, and collect the
# metrics. (f_1 is declared but never filled within this view — the
# remainder of the script is presumably truncated here.)
start_time = time.time()
depth = 4
accuracy = []
precision = []
recall = []
f_1 = []
for i in range(len(KF.res)):
    # Fit a tree of the chosen depth on this fold's training split.
    my_tree = DT.build_tree(KF.res[i].train_set, depth)
    # Ground-truth labels are taken from the last column of each test row.
    test_label = []
    for row in KF.res[i].test_set:
        test_label.append(row[-1])
    print(
        '--------------------------------------------round: %i------------------------------------------------'
        % (i))
    # DT.print_tree(my_tree, "")
    # print("original labels:", test_label)
    predict = []
    count = 0
    for j in range(len(KF.res[i].test_set)):
        predict.append(DT.predict_val(KF.res[i].test_set[j], my_tree))
    # print("predicted labels:", predict)
    # Per-fold metrics accumulated for later averaging.
    a, p, r, F = PE.evaluate(predict, test_label)
    accuracy.append(a)
    precision.append(p)
    recall.append(r)
コード例 #19
0
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
# The four names below were used in this script but never imported,
# which raised NameError at runtime; they are assumed to live in
# decision_tree alongside the helpers above.
from decision_tree import build_tree
from decision_tree import print_tree
from decision_tree import classify
from decision_tree import print_leaf
import csv

# Load the training rows from data.csv.
training_data = []

with open('data.csv', encoding="utf8") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        # NOTE(review): each parsed row's first cell is split on commas
        # again — this suggests the file's real delimiter differs from ','.
        new_row = []
        for item in row[0].split(','):
            new_row.append(item)
        training_data.append(new_row)

my_tree = build_tree(training_data)

print_tree(my_tree)
print()

# Interactively collect one value per feature, then classify the result.
testing_data = []

for i in range(len(get_header()) - 1):
    ask = 'Введіть ' + str(get_header()[i]) + str(
        get_unique_values(training_data, i)) + ': '
    user_input = input(ask)
    testing_data.append(user_input)

print("Передбачено: %s" % (print_leaf(classify(testing_data, my_tree))))

input()
コード例 #20
0
ファイル: main.py プロジェクト: sdvillegab/ST0245-Eafit
def main():
    """Read the dataset, build a tree, and classify one hard-coded sample."""
    headers, data_set = read_dataset("../csv_data/data_set.csv")
    my_tree = build_tree(data_set, headers)
    #print_tree(my_tree)
    sample = [6.44, 21.0, 65.22, 1431.0, 19.0, 99.0]
    print(print_leaf(classify(sample, my_tree)))
コード例 #21
0
from decision_tree import group_by_fn
# The names below were used in this script but never imported, which
# raised NameError at runtime; they are assumed to live in decision_tree
# alongside group_by_fn.
from decision_tree import build_tree
from decision_tree import chi_split
from decision_tree import count_nodes
from decision_tree import depth
from decision_tree import accuracy

from scipy import stats

import pandas as pd

# Adult (census income) dataset, loaded as plain lists of rows.
income_train = pd.read_csv('../dataset/adult.data', header=None).values.tolist()
income_test = pd.read_csv('../dataset/adult.test', header=None).values.tolist()


# Categorical attribute accessors (column extractors).
def workclass(x) : return x[1]
def edu(x) : return x[3]
def marital_status(x) : return x[4]
def occupation(x) : return x[5]
def relationship(x) : return x[6]
def race(x) : return x[7]
def sex(x) : return x[8]
def native_country(x) : return x[12]

# Class label: the income bracket in column 14.
def income(x): return x[14]

attrfns = [workclass, edu, marital_status, occupation, relationship,
        race, sex, native_country]


# Build a chi-square-split tree, then report its size, depth and accuracy.
root = build_tree(income_train, attrfns, income, (' <=50K', ' >50K'), chi_split)
print("Created decision tree with {0} nodes, depth {1}".format(count_nodes(root), depth(root)))
print(accuracy(root, income_test, income))

コード例 #22
0
ファイル: test_decision_tree.py プロジェクト: egnartsms/card
def test_rival_puts_i_win(set_trump):
    """Rival leads but holds two low clubs; the expected estimate is 1.0."""
    set_trump('hearts')
    root = build_tree(
        [Card('clubs', 10)],
        [Card('clubs', 9), Card('clubs', 6)],
        False)
    assert node_estimate(root) == 1.0
コード例 #23
0
from decision_tree import depth
from decision_tree import gain
from decision_tree import group_by_fn
# The names below were used in this script but never imported, which
# raised NameError at runtime; they are assumed to live in decision_tree
# alongside the helpers above.
from decision_tree import build_tree
from decision_tree import chi_split
from decision_tree import count_nodes
from decision_tree import accuracy


import pandas as pd

# https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)
aust_data = pd.read_csv('../dataset/australian.dat', header=None, delimiter=' ')
# Shuffle once so the 414-row train / remainder test split is random.
shuffled_data = aust_data.sample(frac=1).reset_index(drop=True)

# Categorical attribute accessors (column extractors).
def zero(x): return x[0]
def three(x): return x[3]
def four(x): return x[4]
def five(x): return x[5]
def seven(x): return x[7]
def eight(x): return x[8]
def ten(x): return x[10]
def eleven(x): return x[11]

attrfns = [zero, three, four, five, seven, eight, ten, eleven]

# Class label: integer 0/1 in column 14.
def classfn(x): return int(x[14])

x = shuffled_data[:414].values.tolist()
test_data = shuffled_data[414:].values.tolist()
# Build a chi-square-split tree, then report its size, depth and accuracy.
root = build_tree(x, attrfns, classfn, (0,1), chi_split)
print("Created decision tree with {0} nodes, depth {1}".format(count_nodes(root), depth(root)))
print(accuracy(root, test_data, classfn))
コード例 #24
0
    return x[6]


def eight(x):
    """Extract column 8 of row *x*."""
    return x[8]


def ninth(x):
    """Extract column 9 of row *x*."""
    return x[9]


def eleventh(x):
    """Extract column 11 of row *x*."""
    return x[11]


def twelveth(x):
    """Extract column 12 of row *x*."""
    return x[12]


# Attribute-accessor functions offered to the tree as candidate splits.
cc_att_fns = [
    zeroth, third, fourth, fifth, sixth, eight, ninth, eleventh, twelveth
]
# First 414 shuffled rows train the tree; the remainder test it.
x = shuffled_data[:414].values.tolist()

test_data = shuffled_data[414:].values.tolist()

# Build a chi-square-split tree with '+'/'-' class labels, then report
# its size, depth and test accuracy.
root = build_tree(x, cc_att_fns, cc_class, ('+', '-'), chi_split)
print("Created decision tree with {0} nodes, depth {1}".format(
    count_nodes(root), depth(root)))
print(accuracy(root, test_data, cc_class))
コード例 #25
0
ファイル: engine.py プロジェクト: Ding-Flash/qwc_trace
        # Drop columns that should not feed the model as features.
        feature_.pop('task_duration')
        feature_.pop('read_from_hdfs')
        feature_.pop('records_read')
        feature_.pop('input_bytes/result_bytes')
        feature_.pop('shuffle_read')
        feature_.pop('bytes_per_record')
        feature_.pop('remote_fetch')
        feature_.pop('shuffle_write')
        feature_.pop('write_bytes_per_record')
        feature_.pop('write_bytes/read_bytes')
        labels.append(label)
        row = []
        # Record the key order only on the first iteration (flag_key) so
        # `keys` lines up with the value order in every row.
        for key in feature_:
            if flag_key:
                keys.append(key)
            row.append(feature_[key])
        flag_key = False
        dataset.append(row)

    accuracy, precision, recall = decision_tree.build_tree(
        dataset, labels, keys)
    print('accuracy,precision,recall=', accuracy, precision, recall)
    # exit() makes everything below unreachable (dead code).
    exit()
    # clean dataset
    feature_values = {}
    for key in dataset[0][0]:
        feature_values[key] = []
        for piece in dataset:
            piece = piece[0]
            # NOTE(review): feature_values is a dict, so .append would raise
            # AttributeError if this ever ran — presumably this was meant to
            # be feature_values[key].append(piece[key]).
            feature_values.append(piece[key])