Ejemplo n.º 1
0
def ada_boost_experiment():
    """Run AdaBoost for 100 rounds and chart the resulting error curves.

    Parses the training file, builds an AdaBoost hypothesis from it, and
    evaluates that hypothesis against both the training and the test file.
    For every round it also evaluates that round's individual tree
    (``hypothesis[t][0]``) on both files. Two charts are produced through
    ``GraphUtility.graph``: one for the boosted hypothesis and one for the
    individual trees.

    Relies on the module-level ``FILE_PATH_TRAIN``, ``FILE_PATH_TEST``,
    ``numeric_cols`` and ``missing_identifier``.
    """
    rounds = 100

    def error_rate(outcome):
        # Every evaluation result exposes two numbers at [0] and [1]; the
        # value plotted on the "Error" axis is 1 - [0] / [1].
        return 1 - outcome[0] / outcome[1]

    examples = ID3.data_parsing(FILE_PATH_TRAIN, numeric_cols)
    hypothesis = AdaBoost.ada_boost(
        examples, rounds, numeric_cols, missing_identifier)

    boosted_on_train = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
    boosted_on_test = AdaBoost.test_ada_boost_hypothesis(
        hypothesis, FILE_PATH_TEST, numeric_cols, missing_identifier)

    ada_train, ada_test = [], []
    dec_train, dec_test = [], []

    for step in range(rounds):
        ada_train.append(error_rate(boosted_on_train[step]))
        ada_test.append(error_rate(boosted_on_test[step]))

        # Evaluate this round's tree on its own, training file first.
        round_tree = hypothesis[step][0]
        on_train = ID3.test_tree(
            round_tree, FILE_PATH_TRAIN, numeric_cols, missing_identifier)
        dec_train.append(error_rate(on_train))
        on_test = ID3.test_tree(
            round_tree, FILE_PATH_TEST, numeric_cols, missing_identifier)
        dec_test.append(error_rate(on_test))

    GraphUtility.graph(
        [(ada_train, "AdaBoost Train"), (ada_test, "AdaBoost Test")],
        "AdaBoost Data", "Iterations", "Error")

    GraphUtility.graph(
        [(dec_train, "Tree Train"), (dec_test, "Tree Test")],
        "Decision Tree Data", "Iterations", "Error")
Ejemplo n.º 2
0
def credit_experiment(
        file_path="/home/john/PycharmProjects/u1201441_Private_Repository/CS6350_Files/HW2/credit/default of credit card clients.csv"):
    """Compare a single tree, AdaBoost, bagging and a random forest.

    Parses the credit-default CSV, rewrites the label column to
    "yes"/"no", randomly holds out 6000 rows as a test set, trains the four
    models on the remaining rows and draws three charts through
    ``GraphUtility.graph`` (AdaBoost, bagged trees, random forest), each
    with the single decision tree's train/test error drawn as a flat
    reference line.

    Args:
        file_path: Location of the CSV file to load. Defaults to the path
            this experiment was originally written against.
    """
    numeric_cols = [0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
    missing_identifier = None
    iterations = 100
    test_size = 6000
    sample_size = 100
    # Single source of truth for both the forest construction and the
    # chart legend, so the legend can no longer disagree with the model.
    feature_size = 4

    def error_rate(outcome):
        # Results expose two numbers at [0] and [1]; the plotted error is
        # 1 - [0] / [1].
        return 1 - outcome[0] / outcome[1]

    def error_series(outcomes):
        # One error value per iteration; indexing (rather than iterating)
        # keeps the original failure mode if fewer results come back.
        return [error_rate(outcomes[t]) for t in range(iterations)]

    data = ID3.data_parsing(file_path, numeric_cols)

    # The label sits in the second-to-last column of each parsed row.
    LABEL_INDEX = len(data[0]) - 2
    for instance in data:
        instance[LABEL_INDEX] = "yes" if instance[LABEL_INDEX] == '1' else "no"

    # Random hold-out split: the first `test_size` positions of a random
    # permutation form the test set. (Comparing the index *value* against
    # the threshold would always select the first rows of the file and make
    # the shuffle pointless.)
    shuffled = random.sample(range(len(data)), len(data))
    test_data = [data[i] for i in shuffled[:test_size]]
    training_data = [data[i] for i in shuffled[test_size:]]

    decision_tree = ID3.build_decision_tree(
        training_data,
        max_depth=-1,
        info_gain_type=1,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    adaboost = AdaBoost.ada_boost(training_data,
                                  iterations=iterations,
                                  numeric_cols=numeric_cols,
                                  missing_identifier=missing_identifier)
    bagged_tree = BaggedTrees.bagged_trees(
        training_data,
        iterations=iterations,
        sample_size=sample_size,
        numeric_cols=numeric_cols,
        missing_identifier=missing_identifier)
    forest = RandomForest.random_forest(training_data,
                                        iterations=iterations,
                                        sample_size=sample_size,
                                        numeric_cols=numeric_cols,
                                        missing_identifier=missing_identifier,
                                        feature_size=feature_size)

    # Decision tree results: a single value each, repeated so it can be
    # drawn as a horizontal reference line on every chart.
    tree_train = error_rate(ID3.test_tree(
        decision_tree, training_data, numeric_cols, missing_identifier))
    tree_test = error_rate(ID3.test_tree(
        decision_tree, test_data, numeric_cols, missing_identifier))
    tree_train_ln = [tree_train] * iterations
    tree_test_ln = [tree_test] * iterations

    # AdaBoost results
    ada_train = error_series(AdaBoost.test_ada_boost_hypothesis(
        adaboost, training_data, numeric_cols, missing_identifier))
    ada_test = error_series(AdaBoost.test_ada_boost_hypothesis(
        adaboost, test_data, numeric_cols, missing_identifier))

    ada_graph = [
        (ada_train, "AdaBoost Train"),
        (ada_test, "AdaBoost Test"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test"),
    ]
    GraphUtility.graph(ada_graph, "AdaBoost Data", "Iterations", "Error")

    # Bagging results
    bag_train = error_series(BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, training_data, numeric_cols, missing_identifier))
    bag_test = error_series(BaggedTrees.test_bagged_tree_hypothesis(
        bagged_tree, test_data, numeric_cols, missing_identifier))

    bag_graph = [
        (bag_train, "Bagging Train"),
        (bag_test, "Bagging Test"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test"),
    ]
    GraphUtility.graph(bag_graph, "Bagged Tree Data", "Num Trees", "Error")

    # Forest results
    forest_train = error_series(RandomForest.test_random_forest_hypothesis(
        forest, training_data, numeric_cols, missing_identifier))
    forest_test = error_series(RandomForest.test_random_forest_hypothesis(
        forest, test_data, numeric_cols, missing_identifier))

    forest_graph = [
        (forest_train,
         "Forest Train - " + str(feature_size) + " features"),
        (forest_test,
         "Forest Test - " + str(feature_size) + " features"),
        (tree_train_ln, "Tree Train"),
        (tree_test_ln, "Tree Test"),
    ]
    GraphUtility.graph(forest_graph, "Random Forest Data", "Num Trees",
                       "Error")