def ada_boost_experiment():
    """Run the AdaBoost experiment and chart ensemble and per-round tree error.

    Parses the training file, fits AdaBoost for 100 rounds, scores the
    resulting hypothesis on the train and test files, and additionally scores
    the tree stored at ``hypothesis[t][0]`` for every round ``t`` on both
    files.  Two charts are drawn through ``GraphUtility.graph``: one for the
    boosted hypothesis and one for the individual trees.

    Reads the module-level ``FILE_PATH_TRAIN``, ``FILE_PATH_TEST``,
    ``numeric_cols`` and ``missing_identifier``.
    """
    def error_of(result):
        # Each result is indexed as a pair and plotted as "Error";
        # presumably (correct, total) -- TODO confirm against the test helpers.
        return 1 - result[0] / result[1]

    rounds = 100

    parsed_train = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)
    hypothesis = AdaBoost.ada_boost(
        parsed_train, rounds, numeric_cols, missing_identifier)

    boosted_train_scores = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    boosted_test_scores = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    boosted_train_err, boosted_test_err = [], []
    tree_train_err, tree_test_err = [], []
    for round_idx in range(rounds):
        boosted_train_err.append(error_of(boosted_train_scores[round_idx]))
        boosted_test_err.append(error_of(boosted_test_scores[round_idx]))

        # Score this round's tree on its own, outside the boosted vote.
        round_tree = hypothesis[round_idx][0]
        tree_train_err.append(error_of(ID3.test_tree(
            round_tree, FILE_PATH_TRAIN, numeric_cols, missing_identifier)))
        tree_test_err.append(error_of(ID3.test_tree(
            round_tree, FILE_PATH_TEST, numeric_cols, missing_identifier)))

    boosted_series = [
        (boosted_train_err, "AdaBoost Train"),
        (boosted_test_err, "AdaBoost Test"),
    ]
    GraphUtility.graph(boosted_series, "AdaBoost Data", "Iterations", "Error")

    tree_series = [
        (tree_train_err, "Tree Train"),
        (tree_test_err, "Tree Test"),
    ]
    GraphUtility.graph(tree_series, "Decision Tree Data", "Iterations", "Error")
def credit_experiment(file_path=None, iterations=100, test_size=6000,
                      sample_size=100, feature_size=4):
    """Compare a single tree, AdaBoost, bagging and a random forest on credit data.

    The data set is parsed, its label column is rewritten to "yes"/"no",
    and the instances are randomly split into a test set of ``test_size``
    rows and a training set holding the rest.  Each learner is trained on the
    training set, scored on both sets, and charted with ``GraphUtility.graph``
    against the single-tree error drawn as a flat baseline.

    Args:
        file_path: CSV file to load.  ``None`` selects the original
            hard-coded location of "default of credit card clients.csv".
        iterations: Number of boosting rounds / ensemble members, and the
            number of points on each chart.
        test_size: Number of randomly chosen instances held out for testing.
        sample_size: ``sample_size`` forwarded to the bagging and forest
            learners.
        feature_size: ``feature_size`` forwarded to the random forest; also
            shown in the forest chart legend.
    """
    if file_path is None:
        file_path = "/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/credit/default of credit card clients.csv"
    numeric_cols = [0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
    missing_identifier = None

    def error_rates(results):
        # Each results[t] is indexed as a pair and plotted as "Error";
        # presumably (correct, total) -- TODO confirm against the test helpers.
        return [1 - results[t][0] / results[t][1] for t in range(iterations)]

    data = ID3.data_parsing(file_path, numeric_cols)

    # The label sits one position before the last element of each parsed row
    # (NOTE(review): the trailing element is presumably added by the parser).
    label_index = len(data[0]) - 2
    for instance in data:
        instance[label_index] = "yes" if instance[label_index] == '1' else "no"

    # Split on the *position* in the shuffled order.  The previous code tested
    # the index value itself (``i < 6000``), which always sent the first 6000
    # rows of the file to the test set, so the split was never random.
    shuffled = random.sample(range(len(data)), len(data))
    test_data = [data[i] for i in shuffled[:test_size]]
    training_data = [data[i] for i in shuffled[test_size:]]

    decision_tree = ID3.build_decision_tree(
        training_data, max_depth=-1, info_gain_type=1,
        numeric_cols=numeric_cols, missing_identifier=missing_identifier)
    adaboost = AdaBoost.ada_boost(
        training_data, iterations=iterations,
        numeric_cols=numeric_cols, missing_identifier=missing_identifier)
    bagged_tree = BaggedTrees.bagged_trees(
        training_data, iterations=iterations, sample_size=sample_size,
        numeric_cols=numeric_cols, missing_identifier=missing_identifier)
    forest = RandomForest.random_forest(
        training_data, iterations=iterations, sample_size=sample_size,
        numeric_cols=numeric_cols, missing_identifier=missing_identifier,
        feature_size=feature_size)

    # Decision tree results: one value per data set, repeated so it plots as
    # a horizontal reference line on every chart.
    tree_results = ID3.test_tree(
        decision_tree, training_data, numeric_cols, missing_identifier)
    tree_train = 1 - tree_results[0] / tree_results[1]
    tree_results = ID3.test_tree(
        decision_tree, test_data, numeric_cols, missing_identifier)
    tree_test = 1 - tree_results[0] / tree_results[1]
    tree_train_ln = [tree_train] * iterations
    tree_test_ln = [tree_test] * iterations
    baseline = [(tree_train_ln, "Tree Train"), (tree_test_ln, "Tree Test")]

    # AdaBoost results
    ada_results_train = AdaBoost.test_ada_boost_hypothesis(
        adaboost, training_data, numeric_cols, missing_identifier)
    ada_results_test = AdaBoost.test_ada_boost_hypothesis(
        adaboost, test_data, numeric_cols, missing_identifier)
    ada_graph = [
        (error_rates(ada_results_train), "AdaBoost Train"),
        (error_rates(ada_results_test), "AdaBoost Test"),
        *baseline,
    ]
    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    # Bagging results
    results_train = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, training_data, numeric_cols, missing_identifier)
    results_test = BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, test_data, numeric_cols, missing_identifier)
    bag_graph = [
        (error_rates(results_train), "Bagging Train"),
        (error_rates(results_test), "Bagging Test"),
        *baseline,
    ]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Forest results
    results_train = RandomForest.test_random_forest_hypothesis(
        forest, training_data, numeric_cols, missing_identifier)
    results_test = RandomForest.test_random_forest_hypothesis(
        forest, test_data, numeric_cols, missing_identifier)
    # The legend reports the feature_size actually used to build the forest;
    # it was previously hard-coded to 2 while the forest was built with 4.
    forest_graph = [
        (error_rates(results_train),
         "Forest Train - " + str(feature_size) + " features"),
        (error_rates(results_test),
         "Forest Test - " + str(feature_size) + " features"),
        *baseline,
    ]
    GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees", "Error")