Beispiel #1
0
def remove_features(removal_order, train_file, test_file, attr_file,
                    max_features):
    """Measure decision-tree accuracy while cumulatively removing features.

    Columns are taken from ``removal_order`` one at a time and added to a
    growing removal list. After each addition the train and test sets are
    re-read without those columns, a tree is learned on the training data,
    and its accuracy on both sets is recorded.

    Args:
        removal_order: Iterable of columns, in the order they are removed.
        train_file: Training data source passed to ``read_data``.
        test_file: Test data source passed to ``read_data``.
        attr_file: Attribute description source passed to ``read_data``.
        max_features: The loop stops, without evaluating, as soon as the
            number of removed columns equals this value.

    Returns:
        A ``(train_accs, test_accs)`` tuple of lists holding one accuracy per
        evaluated removal step.
    """
    train_accs = []
    test_accs = []
    remove_columns = []
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        if len(remove_columns) == max_features:
            break
        print(remove_columns)

        # Use the file arguments given by the caller; the previous code read
        # the undefined names `train`, `attr` and `test` here.
        train_data, train_attr = read_data(train_file,
                                           attr_file,
                                           remove_columns=remove_columns)
        # Only the rows of the test set are needed; its attributes are unused.
        test_data, _ = read_data(test_file,
                                 attr_file,
                                 remove_columns=remove_columns)
        tree = decision_tree.DecisionTreeLearning(train_data, train_attr,
                                                  "normal", "class")

        decision_tree.print_tree(tree)
        y_pred, y_true = decision_tree.predict(train_data, tree)
        train_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Training Data: {0}'.format(train_acc * 100))
        y_pred, y_true = decision_tree.predict(test_data, tree)
        test_acc = decision_tree.accuracy_score(y_pred, y_true)
        # This figure is for the test set, so label it as such.
        print('Accuracy on Test Data: {0}'.format(test_acc * 100))

        train_accs.append(train_acc)
        test_accs.append(test_acc)
    return train_accs, test_accs
Beispiel #2
0
def menu_1():
    """Ask for a data file and a label file, then build and print a tree.

    Two filenames are read from the console and handed to ``prep_data``. A
    decision tree is built from the prepared data and printed through
    ``decision_tree.print_tree`` with a second argument of 100.
    """
    cursor = " >>  "
    data_prompt = "\nType the filename you would like to run the classifier on"
    label_prompt = ("\nPlease type the filename with the types of "
                    "classifiers listed")

    print(data_prompt)
    filename = raw_input(cursor)
    print(label_prompt)
    label_file = raw_input(cursor)

    tree = decision_tree.build_decision_tree(prep_data(filename, label_file))
    decision_tree.print_tree(tree, 100)
Beispiel #3
0
def main():
    """Run three classifiers on train.txt/test.txt and write output.txt.

    Naive Bayes, k-nearest-neighbours (k=5) and a decision tree are each run
    on the same pair of files. The tree is printed into ``output.txt``,
    followed by one tab-separated line with the decision-tree, kNN and
    Bayes accuracies rounded to two decimals.
    """
    train_path, test_path = 'train.txt', 'test.txt'

    bayes_accuracy = naive_bayes(train_path, test_path)
    knn_accuracy = knn(train_path, test_path, k=5)
    dt_accuracy, tree = decision_tree(train_path, test_path)

    with open('output.txt', 'w') as out:
        print_tree(tree, out)
        # Column order in the summary line: decision tree, kNN, Bayes.
        scores = (dt_accuracy, knn_accuracy, bayes_accuracy)
        summary = '\n{}\t{}\t{}\n'.format(*[round(s, 2) for s in scores])
        out.write(summary)
Beispiel #4
0
def run_tests(df, df_training, labels):
    """Build and report a tree for every measure and for depths 1 to 4.

    For each member of ``dt.Measure`` and each depth in ``range(1, 5)`` a
    tree is built from ``df_training.values`` with a minimum split size of 1,
    printed with ``labels``, and scored against ``df.values``.
    """
    rule = "=" * 40
    min_split = 1
    for measure in dt.Measure:
        for tree_depth in range(1, 5):
            test_set = df.values
            tree = dt.build_tree(
                df_training.values,
                max_depth=tree_depth,
                min_size=min_split,
                measure=measure,
            )

            print(rule)
            dt.print_tree(tree, labels)
            print('Min split:   {}'.format(min_split))
            print('Tree depth:  {}'.format(tree_depth))
            print('Train Size:  {}'.format(len(df_training)))
            print('Test Size:   {}'.format(len(test_set)))
            # Accuracy is computed here, at the point where it is reported.
            score = dt.accuracy(test_set, tree)
            print('Accuracy:    {:.4f}'.format(score))
            print('Measure:     {}'.format(measure))
            print(rule)
Beispiel #5
0
def main():
    """Fit, report, prune and re-print a depth-3 GINI tree on the wine data.

    Reads ``winequalityN.csv``, passes it through ``dt.remove_garbage``,
    moves the first column to the end, drops ``total sulfur dioxide`` and
    trains on a fixed 100-row sample. The full frame is used for scoring.
    """
    # Display options for printed data frames.
    pd.set_option('display.max_columns', 11)
    pd.set_option('display.width', 200)

    # Load the raw data and clean it.
    raw = pd.read_csv('winequalityN.csv')
    df = dt.remove_garbage(
        pd.DataFrame(data=raw, columns=list(raw.columns.values)))

    # Rotate the first column (wine colour) to the last position.
    cols = df.columns.tolist()
    cols = cols[1:] + cols[0:1]
    df = df[cols].drop(['total sulfur dioxide'], axis='columns')
    labels = df.columns.values

    # Per-colour subsets; selected here but not used further in this function.
    df_white = df[(df['type'] == 0.0)]
    df_red = df[(df['type'] == 1.0)]
    # Mixed training sample with a fixed seed.
    df_training = df.sample(n=100, random_state=1)

    # A sweep over measures and depths is available via
    # run_tests(df, df_training, labels); it is not run here.

    rule = "=" * 40
    tree_depth, min_split = 3, 1
    test_set = df.values
    measure = dt.Measure.GINI
    tree = dt.build_tree(
        df_training.values,
        max_depth=tree_depth,
        min_size=min_split,
        measure=measure,
    )

    print(rule)
    dt.print_tree(tree, labels)
    print('Min split:   {}'.format(min_split))
    print('Tree depth:  {}'.format(tree_depth))
    print('Train Size:  {}'.format(len(df_training)))
    print('Test Size:   {}'.format(len(test_set)))
    score = dt.accuracy(test_set, tree)
    print('Accuracy:    {:.4f}'.format(score))
    print('Measure:     {}'.format(measure))
    print(rule)

    # Prune the tree, then print it again.
    dt.prune_tree(tree)
    dt.print_tree(tree, labels)
# --- Run configuration ------------------------------------------------------
prediction_type = 'classification'
criterion = 'gini'
max_depth = 2
min_sample_per_node = 2
test_fraction = 0.25

# --- Data -------------------------------------------------------------------
# NOTE(review): `filename` and `target` are not defined in the visible part of
# this script; presumably they are set earlier in the file — confirm.
X, Y, X_feature_names = common_fns.get_data(
    filename=filename,
    target=target,
)

# Random train/test split using the configured test fraction.
X_train, Y_train, X_test, Y_test = common_fns.split_train_test(
    X,
    Y,
    test_fraction=test_fraction,
)

# --- Model ------------------------------------------------------------------
# Fit the tree on the training portion only.
tree = dtree.fit_decision_tree(
    X=X_train,
    Y=Y_train,
    max_depth=max_depth,
    min_sample_per_node=min_sample_per_node,
    criterion=criterion,
)

# Print the fitted tree for debugging/reference.
dtree.print_tree(tree=tree, X_feature_names=X_feature_names)

# --- Evaluation -------------------------------------------------------------
# Predict on the held-out test portion.
Y_predict = dtree.predict_all(
    tree=tree,
    X=X_test,
    prediction_type=prediction_type,
)

# Score the predictions against the test targets.
dtree.calculate_scores(Y_predict=Y_predict, Y_ref=Y_test)
Beispiel #7
0
 def __repr__(self):
     """Print the tree through ``watdt.print_tree`` and return ``''``.

     The printing is done by ``watdt.print_tree`` itself. An empty string
     is returned so that no extra text (such as ``None``) follows the
     printed tree when the object is displayed.
     """
     # The previous `str(...) == None` test could never be true and both of
     # its branches returned '', so the call and the return are now explicit.
     watdt.print_tree(self.tree)
     return ''
Beispiel #8
0
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
import csv

# NOTE(review): build_tree, print_tree, classify and print_leaf are called
# below but were never imported. They are presumably defined in decision_tree
# next to get_header — confirm against that module.
from decision_tree import build_tree
from decision_tree import classify
from decision_tree import print_leaf
from decision_tree import print_tree

training_data = []

# newline='' is what the csv module documentation asks for when opening files.
with open('data.csv', encoding="utf8", newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        # Blank lines come back as empty lists; skip them instead of failing
        # on row[0].
        if not row:
            continue
        # Only the first CSV field is used, split again on commas.
        # NOTE(review): this presumes each line of data.csv is one quoted
        # field holding the whole record — confirm against the data file.
        training_data.append(row[0].split(','))

my_tree = build_tree(training_data)

print_tree(my_tree)
print()

testing_data = []

# Ask the user for a value for every header column except the last one.
header = get_header()
for i in range(len(header) - 1):
    ask = 'Введіть ' + str(header[i]) + str(
        get_unique_values(training_data, i)) + ': '
    testing_data.append(input(ask))

print("Передбачено: %s" % (print_leaf(classify(testing_data, my_tree))))

# Wait for a final Enter before the script exits.
input()