def write_file(suffix: str, recoverable: bool = True, readable: bool = False):
    """Build a decision tree for the ``suffix`` dataset and write it to disk.

    The tree is built from the training and validation splits combined,
    printed to the console in preorder, and then stored in the requested
    format(s).

    Args:
        suffix: Dataset identifier passed to ``dt_util.init_dt_data`` and
            ``dt_util.get_filename``.
        recoverable: When true, store the tree with ``tree_to_file`` (the
            recoverable format).
        readable: When true, score the tree on the test split and store the
            readable format together with that accuracy.
    """
    train_rows, val_rows, test_rows = dt_util.init_dt_data(suffix)

    # Grow the tree from training + validation data.
    tree = DecisionTree(
        data=train_rows + val_rows,
        max_depth=5,
        min_node_size=50,
        var_types=dt_util.var_types,
    )
    root = tree.split(node=tree.root_node)

    # Show the tree on the console before anything is written.
    storage = NodeStorage(
        root_node=root,
        fname=dt_util.get_filename(suffix),
        header=dt_util.header,
        var_types=dt_util.var_types,
    )
    storage.print_tree_preorder(node=root)

    if recoverable:
        storage.tree_to_file(root_node=root)

    if not readable:
        return

    # The readable file also records accuracy on the held-out test split.
    predictions = tree.predict_list(examples=test_rows, root_node=root)
    test_accuracy = dt_util.accuracy(dt_util.get_labels(test_rows), predictions)
    storage.tree_to_file_readable(root_node=root, acc=test_accuracy)
def read_file(fname: str, write_readable: bool = False, pos_data: List[str] = None) -> "DecisionTree":
    """Load a stored tree from ``fname`` and wrap it in a ``DecisionTree``.

    The tree is read back with ``NodeStorage.file_to_tree``, printed to the
    console in preorder as a sanity check, and wrapped in a ``DecisionTree``
    so it can be used for predictions.

    Args:
        fname: Path of the recoverable tree file to read.
        write_readable: When true, also write the readable version of the
            tree, including its accuracy on the matching test split.
        pos_data: Candidate dataset suffixes. The first one that occurs as a
            substring of ``fname`` selects the test split used to compute
            the accuracy. Only consulted when ``write_readable`` is true.

    Returns:
        The ``DecisionTree`` built around the loaded root node.

    Raises:
        ValueError: If ``write_readable`` is true and no entry of
            ``pos_data`` (or no ``pos_data`` at all) matches ``fname``, so
            no accuracy can be computed.
    """
    # Read the recoverable file back into a node structure.
    dtns = NodeStorage(fname=fname, header=dt_util.get_header(), var_types=dt_util.var_types)
    root_node = dtns.file_to_tree()

    # Print the tree to the console to make sure loading worked.
    dtns.print_tree_preorder(root_node)

    # Wrap the nodes in a tree object so it can be used for predictions.
    tree = DecisionTree(root_node=root_node)

    if write_readable:
        # The accuracy needs test data; the filename tells us which dataset
        # the tree belongs to. First matching suffix wins.
        matched = next((s for s in (pos_data or []) if s in fname), None)
        if matched is None:
            # Previously this surfaced as a TypeError (pos_data=None) or an
            # UnboundLocalError on ``acc`` (no suffix matched).
            raise ValueError(
                "cannot write readable tree for " + repr(fname)
                + ": no suffix in pos_data " + repr(pos_data)
                + " matches the filename"
            )
        df_test = dt_util.init_dt_data(matched)[2]
        test_pred = tree.predict_list(examples=df_test, root_node=root_node)
        acc = dt_util.accuracy(dt_util.get_labels(df_test), test_pred)
        dtns.tree_to_file_readable(root_node=root_node, acc=acc)

    return tree
# Baseline: guess every label at random and record the resulting accuracy.
import random
import sys

import dt_util

# Dataset suffixes accepted as the single command-line argument.
data_options = ["full_cat", "seg_cat", "full", "seg", "big_cat"]
if len(sys.argv) < 2 or sys.argv[1] not in data_options:
    print("Expecting argument in ", data_options)
    # sys.exit rather than the site-provided exit() helper, which is meant
    # for the interactive shell; a non-zero status flags the usage error.
    sys.exit(1)
suffix = sys.argv[1]

# init_dt_data yields the train/validation/test splits; only the test split
# (index 2) is needed here.
df_test = dt_util.init_dt_data(suffix)[2]

# y collects the true labels, p the random predictions.
y, p = [], []
# NOTE(review): iterating rows directly assumes df_test is a plain sequence
# of rows whose last element is the label -- confirm against init_dt_data.
for row in df_test:
    # Generate the prediction at random (0 or 1).
    guess = random.randint(0, 1)
    # Record the label and the guess side by side.
    y.append(int(row[-1]))
    p.append(guess)

# Report the accuracy and the number of items tested.
print("Accuracy is " + str(dt_util.accuracy(y, p)))
print("over " + str(len(p)) + " predictions.")
# Train a decision tree for the dataset named on the command line, print it,
# report its test accuracy, and write the readable tree file.

# The dataset suffix is the single required command-line argument.
print("Expecting suffix argument: full_cat, seg_cat, full, or seg.")
if len(sys.argv) < 2:
    # Stop cleanly instead of failing with an IndexError traceback on
    # sys.argv[1] when the argument is missing.
    sys.exit(1)
suffix = sys.argv[1]

# Load the train/validation/test splits for this dataset.
df_train, df_val, df_test = dt_util.init_dt_data(suffix)

# The tree settings are finalized, so train on training + validation data.
tree = DecisionTree(
    data=df_train + df_val,
    max_depth=5,
    min_node_size=50,
    var_types=dt_util.var_types,
)
print("tree initialization finished")
root_node = tree.split(node=tree.root_node)
print("tree split finished")

# Display and store the tree.
dtns = NodeStorage(
    root_node=root_node,
    fname=dt_util.get_filename(suffix),
    header=dt_util.get_header(),
    var_types=dt_util.var_types,
)
print("dtns initialization finished")
print("\nPrinting tree preorder")
dtns.print_tree_preorder(node=root_node)

# Score the tree on the held-out test split.
print("\nComputing the accuracy")
test_labels = dt_util.get_labels(df_test)
test_pred = tree.predict_list(examples=df_test, root_node=root_node)
acc = dt_util.accuracy(test_labels, test_pred)
print("Accuracy is " + str(acc) + "\n")

# Write the readable tree file, including the accuracy just computed.
print("attempting to write to file")
dtns.tree_to_file_readable(root_node=root_node, acc=acc)
print("finished writing tree to file")