Ejemplo n.º 1
0
def run_glass(filename, target_class, class_wanted, glass_names):
    """Run Naive Bayes and logistic regression on five folds of the glass data.

    The data is loaded with ``glass.Glass.setup_data_glass``, split into five
    folds by ``FiveFold.five_fold_sort_class`` (sorted on ``target_class``),
    and every fold is handed once to ``nb_glass`` and once to ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_glass``.
        target_class: Forwarded to ``setup_data_glass`` and ``nb_glass``; also
            used as the ``sortby`` key for the five-fold split.
        class_wanted: Forwarded to ``setup_data_glass``.
        glass_names: Forwarded to ``setup_data_glass``.

    Returns:
        ``(nb_perf, lr_perf)``: two lists of five entries each, in fold order,
        holding whatever ``nb_glass`` / ``perform_lr`` returned per fold
        (presumably a performance figure; confirm against those helpers).
    """
    # Setup data
    glass_obj = glass.Glass()
    glass_data = glass_obj.setup_data_glass(filename=filename,
                                            target_class=target_class,
                                            class_wanted=class_wanted,
                                            glass_names=glass_names)

    # Setup five fold cross validation
    five_fold = ff.FiveFold()
    glass1, glass2, glass3, glass4, glass5 = five_fold.five_fold_sort_class(
        data=glass_data, sortby=target_class)
    folds = (glass1, glass2, glass3, glass4, glass5)

    # Each fold gets its own Naive Bayes run; the previous code evaluated
    # fold 1 five times and never touched folds 2-5.
    nb_perf = [
        nb_glass(glass_data=fold, target_class=target_class) for fold in folds
    ]

    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
Ejemplo n.º 2
0
def run_votes(filename, target_class, class_wanted, vote_names):
    """Run Naive Bayes and logistic regression on five folds of the votes data.

    The data is loaded with ``votes.HouseVotes.setup_data_votes``, split into
    five folds by ``FiveFold.five_fold_sort_class`` (sorted on
    ``target_class``), and every fold is handed once to ``nb_votes`` and once
    to ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_votes``.
        target_class: Forwarded to ``setup_data_votes`` and ``nb_votes``; also
            used as the ``sortby`` key for the five-fold split.
        class_wanted: Forwarded to ``setup_data_votes``.
        vote_names: Forwarded to ``setup_data_votes``.

    Returns:
        ``(nb_perf, lr_perf)``: two lists of five entries each, in fold order,
        holding whatever ``nb_votes`` / ``perform_lr`` returned per fold
        (presumably a performance figure; confirm against those helpers).
    """
    # Setup data
    votes_obj = votes.HouseVotes()
    votes_data = votes_obj.setup_data_votes(filename=filename,
                                            target_class=target_class,
                                            class_wanted=class_wanted,
                                            vote_names=vote_names)
    # Setup five fold cross validation
    five_fold = ff.FiveFold()
    votes1, votes2, votes3, votes4, votes5 = five_fold.five_fold_sort_class(
        data=votes_data, sortby=target_class)
    folds = (votes1, votes2, votes3, votes4, votes5)

    # Each fold gets its own Naive Bayes run; the previous code evaluated
    # fold 1 five times and never touched folds 2-5.
    nb_perf = [
        nb_votes(votes_data=fold, target_class=target_class) for fold in folds
    ]

    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
Ejemplo n.º 3
0
def run_bc(filename, target_class, class_wanted, bc_names):
    """Run Naive Bayes and logistic regression on five folds of the breast cancer data.

    The data is loaded with ``bc.BreastCancer.setup_data_bc``, split into five
    folds by ``FiveFold.five_fold_sort_class`` (sorted on ``target_class``),
    and every fold is handed once to ``nb_bc`` and once to ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_bc``.
        target_class: Forwarded to ``setup_data_bc`` and ``nb_bc``; also used
            as the ``sortby`` key for the five-fold split.
        class_wanted: Forwarded to ``setup_data_bc``.
        bc_names: Forwarded to ``setup_data_bc``.

    Returns:
        ``(nb_perf, lr_perf)``: two lists of five entries each, in fold order,
        holding whatever ``nb_bc`` / ``perform_lr`` returned per fold
        (presumably a performance figure; confirm against those helpers).
    """
    # Setup data
    bc_obj = bc.BreastCancer()
    bc_data = bc_obj.setup_data_bc(filename=filename,
                                   target_class=target_class,
                                   class_wanted=class_wanted,
                                   bc_names=bc_names)
    # Setup five fold cross validation
    five_fold = ff.FiveFold()
    bc1, bc2, bc3, bc4, bc5 = five_fold.five_fold_sort_class(
        data=bc_data, sortby=target_class)
    folds = (bc1, bc2, bc3, bc4, bc5)

    # Each fold gets its own Naive Bayes run; the previous code evaluated
    # fold 1 five times and never touched folds 2-5.
    nb_perf = [nb_bc(bc_data=fold, target_class=target_class) for fold in folds]

    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
Ejemplo n.º 4
0
def run_iris(filename, target_class, class_wanted, iris_names):
    """Run Naive Bayes and logistic regression on five folds of the iris data.

    The data is loaded with ``iris.Iris.setup_data_iris``, split into five
    folds by ``FiveFold.five_fold_sort_class`` (sorted on ``target_class``),
    and every fold is handed once to ``nb_iris`` and once to ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_iris``.
        target_class: Forwarded to ``setup_data_iris`` and ``nb_iris``; also
            used as the ``sortby`` key for the five-fold split.
        class_wanted: Forwarded to ``setup_data_iris``.
        iris_names: Forwarded to ``setup_data_iris``.

    Returns:
        ``(nb_perf, lr_perf)``: two lists of five entries each, in fold order,
        holding whatever ``nb_iris`` / ``perform_lr`` returned per fold
        (presumably a performance figure; confirm against those helpers).
    """
    # Setup data
    iris_obj = iris.Iris()
    iris_data = iris_obj.setup_data_iris(filename=filename,
                                         target_class=target_class,
                                         class_wanted=class_wanted,
                                         iris_names=iris_names)

    # Setup five fold cross validation
    five_fold = ff.FiveFold()
    iris1, iris2, iris3, iris4, iris5 = five_fold.five_fold_sort_class(
        data=iris_data, sortby=target_class)
    folds = (iris1, iris2, iris3, iris4, iris5)

    # Each fold gets its own Naive Bayes run; the previous code evaluated
    # fold 1 five times and never touched folds 2-5.
    nb_perf = [
        nb_iris(iris_data=fold, target_class=target_class) for fold in folds
    ]

    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
Ejemplo n.º 5
0
    def run_seg(self, filename, column_names, sortby):
        """Load the segmentation data, run the five-fold split, return the data.

        The five folds produced by ``five_fold_sort_class`` are not used
        afterwards; the value returned is what ``setup_data`` produced.
        """
        loaded = s.Segmentation().setup_data(filename=filename,
                                             column_names=column_names)

        # The split result is discarded, but the five-way unpacking is kept so
        # a split that does not yield exactly five parts still raises here.
        _fold1, _fold2, _fold3, _fold4, _fold5 = (
            ff.FiveFold().five_fold_sort_class(data=loaded, sortby=sortby))
        return loaded
Ejemplo n.º 6
0
def run_bc(filename, target_class, class_wanted, bc_names, learning_rate,
           epoch):
    """Train and report feedforward networks on the breast cancer folds.

    Loads the data, splits it into five folds, and for 0, 1 and 2 hidden
    layers calls ``run_feedforward_backpropagation`` on every fold.  For each
    layer count it prints the second value returned for fold 1 and the mean
    of the five first values (reported as an accuracy percentage).

    Args:
        filename: Forwarded to ``setup_data_bc``.
        target_class: Forwarded to ``setup_data_bc``; also the ``sortby`` key
            for the five-fold split.
        class_wanted: Forwarded to ``setup_data_bc``.
        bc_names: Forwarded to ``setup_data_bc``.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.

    Returns:
        None; all results are printed.
    """
    # Load the data set
    loader = bc.BreastCancer()
    dataset = loader.setup_data_bc(filename=filename,
                                   target_class=target_class,
                                   class_wanted=class_wanted,
                                   bc_names=bc_names)

    # Five fold cross validation split
    splitter = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # One banner per hidden-layer count; the position in the tuple is the
    # number of hidden layers passed to the network.
    banners = (
        "Breast Cancer 0 layers",
        "Breast Cancer 1 hidden layer",
        "Breast Cancer 2 hidden layers",
    )
    for hidden_layers, banner in enumerate(banners):
        print(banner)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classifications = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Breast Cancer " +
                      str(hidden_layers) + " hidden layers fold 1:")
                print(classifications)

        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Breast Cancer " + str(hidden_layers) +
              " hidden layers: " + str(mean_accuracy) + "%")
        print()
Ejemplo n.º 7
0
 def run_machine(self, filename, column_names, columns_to_drop, sortby):
     """Load the machine data, run the five-fold split, return the data.

     The five folds produced by ``five_fold_sort_class`` are not used
     afterwards; the value returned is what ``setup_data`` produced.
     """
     loaded = m.Machine().setup_data(filename=filename,
                                     column_names=column_names,
                                     columns_to_drop=columns_to_drop)

     # The split result is discarded, but the five-way unpacking is kept so
     # a split that does not yield exactly five parts still raises here.
     _fold1, _fold2, _fold3, _fold4, _fold5 = (
         ff.FiveFold().five_fold_sort_class(data=loaded, sortby=sortby))
     return loaded
Ejemplo n.º 8
0
    def run_forest(self, filename, column_names, sortby):
        """Load the forest data, run the five-fold split, return the data.

        The five folds produced by ``five_fold_sort_class`` are not used
        afterwards; the value returned is what ``setup_data`` produced.
        """
        loaded = f.Forest().setup_data(filename=filename,
                                       column_names=column_names)

        # The split result is discarded, but the five-way unpacking is kept so
        # a split that does not yield exactly five parts still raises here.
        _fold1, _fold2, _fold3, _fold4, _fold5 = (
            ff.FiveFold().five_fold_sort_class(data=loaded, sortby=sortby))

        return loaded
Ejemplo n.º 9
0
    def run_ecoli(self, filename, column_names, columns_to_drop, sortby):
        """Load the ecoli data, run the five-fold split, return the data.

        The five folds produced by ``five_fold_sort_class`` are not used
        afterwards; the value returned is what ``setup_data`` produced.
        """
        loaded = e.Ecoli().setup_data(filename=filename,
                                      column_names=column_names,
                                      columns_to_drop=columns_to_drop)

        # The split result is discarded, but the five-way unpacking is kept so
        # a split that does not yield exactly five parts still raises here.
        _fold1, _fold2, _fold3, _fold4, _fold5 = (
            ff.FiveFold().five_fold_sort_class(data=loaded, sortby=sortby))

        return loaded
Ejemplo n.º 10
0
def run_spambase(filename, target_class):
    """Run Naive Bayes and logistic regression on five folds of the spambase data.

    The data is loaded with ``spambase.Spambase.setup_data_spambase``, split
    into five folds by ``FiveFold.five_fold_sort_class`` (sorted on
    ``target_class``), and every fold is handed once to ``nb_spambase`` and
    once to ``perform_lr``.

    Args:
        filename: Forwarded to ``setup_data_spambase``.
        target_class: Forwarded to ``setup_data_spambase`` and
            ``nb_spambase``; also used as the ``sortby`` key for the split.

    Returns:
        ``(nb_perf, lr_perf)``: two lists of five entries each, in fold order,
        holding whatever ``nb_spambase`` / ``perform_lr`` returned per fold
        (presumably a performance figure; confirm against those helpers).
    """
    # Setup data
    spambase_obj = spambase.Spambase()
    spambase_data = spambase_obj.setup_data_spambase(filename=filename,
                                                     target_class=target_class)

    # Setup five fold cross validation
    five_fold = ff.FiveFold()
    (spambase1, spambase2, spambase3, spambase4,
     spambase5) = five_fold.five_fold_sort_class(data=spambase_data,
                                                 sortby=target_class)
    folds = (spambase1, spambase2, spambase3, spambase4, spambase5)

    # Each fold gets its own Naive Bayes run; the previous code evaluated
    # fold 1 five times and never touched folds 2-5.
    nb_perf = [
        nb_spambase(spambase_data=fold, target_class=target_class)
        for fold in folds
    ]

    lr_perf = [perform_lr(fold) for fold in folds]

    return nb_perf, lr_perf
Ejemplo n.º 11
0
def run_iris(filename, target_class, class_wanted, iris_names, learning_rate,
             epoch):
    """Train and report feedforward networks on the iris folds.

    Loads the data, splits it into five folds, and for 0, 1 and 2 hidden
    layers calls ``run_feedforward_backpropagation`` on every fold.  For each
    layer count it prints the second value returned for fold 1 and the mean
    of the five first values (reported as an accuracy percentage).

    Args:
        filename: Forwarded to ``setup_data_iris``.
        target_class: Forwarded to ``setup_data_iris``; also the ``sortby``
            key for the five-fold split.
        class_wanted: Forwarded to ``setup_data_iris``.
        iris_names: Forwarded to ``setup_data_iris``.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.

    Returns:
        None; all results are printed.
    """
    # Load the data set
    loader = iris.Iris()
    dataset = loader.setup_data_iris(filename=filename,
                                     target_class=target_class,
                                     class_wanted=class_wanted,
                                     iris_names=iris_names)

    # Five fold cross validation split
    splitter = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # One banner per hidden-layer count; the position in the tuple is the
    # number of hidden layers passed to the network.
    banners = (
        "Iris 0 layers",
        "Iris 1 hidden layer",
        "Iris 2 hidden layers",
    )
    for hidden_layers, banner in enumerate(banners):
        print(banner)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classifications = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Iris " + str(hidden_layers) +
                      " hidden layers fold 1:")
                print(classifications)

        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Iris " + str(hidden_layers) +
              " hidden layers: " + str(mean_accuracy) + "%")
        print()
Ejemplo n.º 12
0
def run_soybean(filename, target_class, learning_rate, epoch):
    """Train and report feedforward networks on the soybean folds.

    Loads the data, splits it into five folds, and for 0, 1 and 2 hidden
    layers calls ``run_feedforward_backpropagation`` on every fold.  For each
    layer count it prints the second value returned for fold 1 and the mean
    of the five first values (reported as an accuracy percentage).

    Args:
        filename: Forwarded to ``setup_data_soybean``.
        target_class: Forwarded to ``setup_data_soybean``; also the ``sortby``
            key for the five-fold split.
        learning_rate: Forwarded to ``run_feedforward_backpropagation``.
        epoch: Forwarded to ``run_feedforward_backpropagation``.

    Returns:
        None; all results are printed.
    """
    # Load the data set
    loader = soybean.Soybean()
    dataset = loader.setup_data_soybean(filename=filename,
                                        target_class=target_class)

    # Five fold cross validation split
    splitter = ff.FiveFold()
    fold1, fold2, fold3, fold4, fold5 = splitter.five_fold_sort_class(
        data=dataset, sortby=target_class)
    folds = (fold1, fold2, fold3, fold4, fold5)

    # One banner per hidden-layer count; the position in the tuple is the
    # number of hidden layers passed to the network.
    banners = (
        "Soybean 0 layers",
        "Soybean 1 hidden layer",
        "Soybean 2 hidden layers",
    )
    for hidden_layers, banner in enumerate(banners):
        print(banner)
        accuracies = []
        for position, fold in enumerate(folds):
            accuracy, classifications = run_feedforward_backpropagation(
                fold, hidden_layers, learning_rate, epoch)
            accuracies.append(accuracy)
            # Only the first fold's classification list is shown.
            if position == 0:
                print("Classification on Soybean " + str(hidden_layers) +
                      " hidden layers fold 1:")
                print(classifications)

        mean_accuracy = np.average(accuracies)
        print("Mean Accuracy of Soybean " + str(hidden_layers) +
              " hidden layers: " + str(mean_accuracy) + "%")
        print()
Ejemplo n.º 13
0
    def run_car(self, filename, sortby):
        """Build, test and prune a decision tree on each of five car folds.

        The loaded data is split 10% / 90%: the 10% sample is the validation
        set used both for testing and for pruning, and the remaining 90% is
        divided into five folds.  One tree is built per fold; its unpruned and
        pruned accuracies are printed, followed by the averages over the five
        folds.

        Args:
            filename: Forwarded to ``Car.setup_data``.
            sortby: Forwarded to ``five_fold_sort_class``.

        Returns:
            None; all results are printed.
        """
        print()
        column_labels = ("buying", "maint", "doors", "persons", "lug boot",
                         "safety", "class")
        car_names = list(column_labels)

        # Load the data set
        loader = c.Car()
        car_data = loader.setup_data(filename=filename, column_names=car_names)

        # 10% validation sample, 90% left over for cross validation
        holdout = car_data.sample(frac=.10)
        remainder = car_data.drop(holdout.index)
        # Renumber the rows of both frames in place
        holdout.reset_index(inplace=True)
        remainder.reset_index(inplace=True)

        # Five fold cross validation split of the 90% portion
        splitter = ff.FiveFold()
        car1, car2, car3, car4, car5 = splitter.five_fold_sort_class(
            data=remainder, sortby=sortby)
        folds = (car1, car2, car3, car4, car5)
        # Remove the 'index' column from every fold before any tree is built
        for fold in folds:
            fold.drop(columns='index', axis=1, inplace=True)

        # Run one tree per fold against the shared validation set.
        unpruned_scores = []
        pruned_scores = []
        for number, fold in enumerate(folds, start=1):
            label = "car" + str(number)
            tree = dt.DecisionTree()
            root = tree.create_decision_tree(data=fold,
                                             features_list=car_names)
            unpruned = tree.run_test(holdout, root)
            print("Unpruned accuracy for " + label + ": " + str(unpruned) +
                  "%")
            pruned_root = tree.prune_tree(holdout, root)
            pruned = tree.run_test(holdout, pruned_root)
            print("Pruned accuracy for " + label + ": " + str(pruned) + "%")
            print()
            unpruned_scores.append(unpruned)
            pruned_scores.append(pruned)
            # Hand the next tree a brand-new column list, as before
            # (presumably create_decision_tree modifies the list it is given).
            car_names = list(column_labels)

        unpruned_accuracy_average = np.average(unpruned_scores)
        pruned_accuracy_average = np.average(pruned_scores)

        print("Unpruned accuracy average for car data: " +
              str(unpruned_accuracy_average) + "%")
        print("Pruned accuracy average for car data: " +
              str(pruned_accuracy_average) + "%")
        print()
Ejemplo n.º 14
0
    def run_seg(self, filename, sortby):
        """Build, test and prune a decision tree on each of five segmentation folds.

        The loaded data is split 10% / 90%: the 10% sample is the validation
        set used both for testing and for pruning, and the remaining 90% is
        divided into five folds.  One tree is built per fold; its unpruned and
        pruned accuracies are printed, followed by the averages over the five
        folds.

        Args:
            filename: Forwarded to ``Segmentation.setup_data``.
            sortby: Forwarded to ``five_fold_sort_class``.

        Returns:
            None; all results are printed.
        """
        print()
        column_labels = (
            "class", "cen col", "cen row", "pix count", "sld -5", "sld -2",
            "vedge mean", "vedge sd", "hedge mean", "hedge sd",
            "intensity mean", "rawred mean", "rawblue mean", "rawgreen mean",
            "exred mean", "exblue mean", "exgreen mean", "value mean",
            "sat mean", "hue mean",
        )
        seg_names = list(column_labels)

        # Load the data set
        loader = s.Segmentation()
        seg_data = loader.setup_data(filename=filename, column_names=seg_names)

        # 10% validation sample, 90% left over for cross validation
        holdout = seg_data.sample(frac=.10)
        remainder = seg_data.drop(holdout.index)
        # Renumber the rows of both frames in place
        holdout.reset_index(inplace=True)
        remainder.reset_index(inplace=True)

        # Five fold cross validation split of the 90% portion
        splitter = ff.FiveFold()
        seg1, seg2, seg3, seg4, seg5 = splitter.five_fold_sort_class(
            data=remainder, sortby=sortby)
        folds = (seg1, seg2, seg3, seg4, seg5)
        # Remove the 'index' column from every fold before any tree is built
        for fold in folds:
            fold.drop(columns='index', axis=1, inplace=True)

        # Run one tree per fold against the shared validation set.
        unpruned_scores = []
        pruned_scores = []
        for number, fold in enumerate(folds, start=1):
            label = "seg" + str(number)
            tree = dt.DecisionTree()
            root = tree.create_decision_tree(data=fold,
                                             features_list=seg_names)
            unpruned = tree.run_test(holdout, root)
            print("Unpruned accuracy for " + label + ": " + str(unpruned) +
                  "%")
            pruned_root = tree.prune_tree(holdout, root)
            pruned = tree.run_test(holdout, pruned_root)
            print("Pruned accuracy for " + label + ": " + str(pruned) + "%")
            print()
            unpruned_scores.append(unpruned)
            pruned_scores.append(pruned)
            # Hand the next tree a brand-new column list, as before
            # (presumably create_decision_tree modifies the list it is given).
            seg_names = list(column_labels)

        unpruned_accuracy_average = np.average(unpruned_scores)
        pruned_accuracy_average = np.average(pruned_scores)

        print("Unpruned accuracy average for seg data: " +
              str(unpruned_accuracy_average) + "%")
        print("Pruned accuracy average for seg data: " +
              str(pruned_accuracy_average) + "%")
        print()
Ejemplo n.º 15
0
    def run_abalone(self, filename, sortby):
        """Build, test and prune a decision tree on each of five abalone folds.

        The loaded data is split 10% / 90%: the 10% sample is the validation
        set used both for testing and for pruning, and the remaining 90% is
        divided into five folds.  One tree is built per fold; its unpruned and
        pruned accuracies are printed, followed by the averages over the five
        folds.

        Args:
            filename: Forwarded to ``Abalone.setup_data``.
            sortby: Forwarded to ``five_fold_sort_class``.

        Returns:
            None; all results are printed.
        """
        print()
        column_labels = (
            "sex", "length", "diameter", "height", "whole weight",
            "shucked weight", "viscera weight", "shell weight", "class",
        )
        abalone_names = list(column_labels)

        # Setup data
        abalone = a.Abalone()
        abalone_data = abalone.setup_data(filename=filename,
                                          column_names=abalone_names)
        # Split the data set into 10% and 90%
        abalone_validation_data = abalone_data.sample(frac=.10)
        abalone_data_rest = abalone_data.drop(abalone_validation_data.index)
        # Reset indexes on data frames
        abalone_validation_data.reset_index(inplace=True)
        abalone_data_rest.reset_index(inplace=True)

        # Setup five fold cross validation on the 90% portion only.  The
        # folds used to be cut from the full data set, so the validation rows
        # were also part of the training folds.
        five_fold = ff.FiveFold()
        (abalone1, abalone2, abalone3, abalone4,
         abalone5) = five_fold.five_fold_sort_class(data=abalone_data_rest,
                                                    sortby=sortby)
        folds = (abalone1, abalone2, abalone3, abalone4, abalone5)
        # reset_index(inplace=True) keeps the old index as an 'index' column;
        # remove it from the training folds, as the car and segmentation
        # runners do.
        for fold in folds:
            fold.drop(columns='index', axis=1, inplace=True)

        # Run one tree per fold against the shared validation set.
        unpruned_scores = []
        pruned_scores = []
        for number, fold in enumerate(folds, start=1):
            label = "abalone" + str(number)
            tree = dt.DecisionTree()
            root = tree.create_decision_tree(data=fold,
                                             features_list=abalone_names)
            unpruned = tree.run_test(abalone_validation_data, root)
            print("Unpruned accuracy for " + label + ": " + str(unpruned) +
                  "%")
            pruned_root = tree.prune_tree(abalone_validation_data, root)
            pruned = tree.run_test(abalone_validation_data, pruned_root)
            print("Pruned accuracy for " + label + ": " + str(pruned) + "%")
            print()
            unpruned_scores.append(unpruned)
            pruned_scores.append(pruned)
            # Hand the next tree a brand-new column list, as the original did
            # (presumably create_decision_tree modifies the list it is given).
            abalone_names = list(column_labels)

        unpruned_accuracy_average = np.average(unpruned_scores)
        pruned_accuracy_average = np.average(pruned_scores)

        print("Unpruned accuracy average for abalone data: " +
              str(unpruned_accuracy_average) + "%")
        print("Pruned accuracy average for abalone data: " +
              str(pruned_accuracy_average) + "%")
        print()