Esempio n. 1
0
def write_file(suffix: str, recoverable: bool = True, readable: bool = False):
    """Build a decision tree for the ``suffix`` dataset and optionally save it.

    The tree is fit on the training and validation splits combined, printed
    to the console in preorder, and then written out in up to two formats.

    Args:
        suffix: dataset identifier handed to ``dt_util.init_dt_data`` and
            ``dt_util.get_filename``.
        recoverable: when True, write the tree with ``tree_to_file`` (the
            recoverable format).
        readable: when True, score the tree on the test split and write the
            readable format together with that accuracy.
    """
    train_rows, val_rows, test_rows = dt_util.init_dt_data(suffix)

    # Fit on training + validation data.
    fitted = DecisionTree(
        data=train_rows + val_rows,
        max_depth=5,
        min_node_size=50,
        var_types=dt_util.var_types,
    )
    root_node = fitted.split(node=fitted.root_node)

    # Echo the tree to the console.
    storage = NodeStorage(
        root_node=root_node,
        fname=dt_util.get_filename(suffix),
        header=dt_util.header,
        var_types=dt_util.var_types,
    )
    storage.print_tree_preorder(node=root_node)

    if recoverable:
        storage.tree_to_file(root_node=root_node)

    if not readable:
        return

    # The readable file carries the test accuracy, so compute it first.
    predictions = fitted.predict_list(examples=test_rows, root_node=root_node)
    accuracy = dt_util.accuracy(dt_util.get_labels(test_rows), predictions)
    storage.tree_to_file_readable(root_node=root_node, acc=accuracy)
Esempio n. 2
0
def read_file(fname: str,
              write_readable: bool = False,
              pos_data: List[str] = None) -> "DecisionTree":
    """Rebuild a decision tree from a previously written recoverable file.

    The stored nodes are loaded through ``NodeStorage``, printed to the
    console in preorder as a sanity check, and wrapped in a ``DecisionTree``
    so the result can be used for predictions.

    Args:
        fname: name of the recoverable tree file to load.
        write_readable: when True, also write the readable version of the
            tree, which requires an accuracy computed on test data.
        pos_data: candidate dataset suffixes. The first one that occurs as a
            substring of ``fname`` selects the test split used to compute the
            accuracy. Only consulted when ``write_readable`` is True.

    Returns:
        The ``DecisionTree`` built around the loaded root node.

    Raises:
        ValueError: if ``write_readable`` is True and no entry of
            ``pos_data`` (including ``pos_data=None``) occurs in ``fname``,
            so no accuracy can be computed for the readable file.
    """
    dtns = NodeStorage(fname=fname,
                       header=dt_util.get_header(),
                       var_types=dt_util.var_types)
    root_node = dtns.file_to_tree()
    # print tree to console to make sure it worked.
    dtns.print_tree_preorder(root_node)
    # make the tree object so we can use it for predictions.
    tree = DecisionTree(root_node=root_node)
    if write_readable:
        # pos_data defaults to None; treat that as "no candidates" instead of
        # failing with a TypeError when iterating it.
        candidates = [] if pos_data is None else pos_data
        # use the filename to figure out which testing data to use for pred/acc.
        for suffix in candidates:
            if suffix in fname:
                df_test = dt_util.init_dt_data(suffix)[2]
                test_pred = tree.predict_list(examples=df_test,
                                              root_node=root_node)
                acc = dt_util.accuracy(dt_util.get_labels(df_test), test_pred)
                break
        else:
            # no candidate matched: acc was never computed, so fail with a
            # clear message rather than an UnboundLocalError further down.
            raise ValueError(
                "cannot write readable tree: no suffix in pos_data matches "
                + repr(fname))
        dtns.tree_to_file_readable(root_node=root_node, acc=acc)
    return tree
Esempio n. 3
0
# make a random guess for each test example, and record the accuracy.

import dt_util
import sys
import random

# the testing set is selected by the first command-line argument
data_options = ["full_cat", "seg_cat", "full", "seg", "big_cat"]
if len(sys.argv) < 2 or sys.argv[1] not in data_options:
    print("Expecting argument in ", data_options)
    # sys.exit is the supported way to stop a script (the bare exit() helper
    # is provided by the site module for interactive use); a usage error
    # reports a non-zero status.
    sys.exit(1)
suffix = sys.argv[1]

# get the testing data (third item returned by init_dt_data)
df_test = dt_util.init_dt_data(suffix)[2]
#print("\nVariables are ", dt_util.header[0:-1])

# labels are the last field of each test row; predictions are random 0/1
# guesses, one per row
y = [int(row[-1]) for row in df_test]
p = [random.randint(0, 1) for _ in df_test]

# check the accuracy and give the number of items tested
print("Accuracy is " + str(dt_util.accuracy(y, p)))
print("over " + str(len(p)) + " predictions.")
Esempio n. 4
0
# the dataset suffix is passed as the first command-line argument
print("Expecting suffix argument: full_cat, seg_cat, full, or seg.")
if len(sys.argv) < 2:
    # without this guard a missing argument ends in a bare IndexError
    sys.exit(1)
suffix = sys.argv[1]
# set tree name and get data
df_train, df_val, df_test = dt_util.init_dt_data(suffix)

# initialize the tree with the training data.
# now that we have finalized the tree, train with both training & validation data
tree = DecisionTree(data=df_train + df_val,
                    max_depth=5,
                    min_node_size=50,
                    var_types=dt_util.var_types)
print("tree initialization finished")
root_node = tree.split(node=tree.root_node)
print("tree split finished")

# display and store the tree
dtns = NodeStorage(root_node=root_node,
                   fname=dt_util.get_filename(suffix),
                   header=dt_util.get_header(),
                   var_types=dt_util.var_types)
print("dtns initialization finished")
print("\nPrinting tree preorder")
dtns.print_tree_preorder(node=root_node)
#print("Printing tree inorder")
#dtns.print_tree_inorder(node=root_node)

# score the tree on the held-out test split
print("\nComputing the accuracy")
test_labels = dt_util.get_labels(df_test)
test_pred = tree.predict_list(examples=df_test, root_node=root_node)
acc = dt_util.accuracy(test_labels, test_pred)
print("Accuracy is " + str(acc) + "\n")

# the readable file is written together with the test accuracy
print("attempting to write to file")
dtns.tree_to_file_readable(root_node=root_node, acc=acc)
print("finished writing tree to file")