Example #1
import numpy
from sklearn import cross_validation, metrics  # pre-0.18 scikit-learn API

import functions  # project-local helpers (get_data_from_csv, predictions_to_values)
from classifiers import BinaryTreeClassifier  # project-local; module path assumed


def main():

    data_sets = ["binary/balance-scale.csv", "binary/tic-tac-toe.csv", "binary/ionosphere.csv",
                 "multi/splice.csv", "multi/glass.csv", "multi/segment.csv"]

    folds = 2

    classifier_types = {}

    max_depth = list(range(5, 51, 5))          # 5, 10, ..., 50
    min_samples_leaf = list(range(5, 51, 5))
    test_result = numpy.zeros((len(max_depth), len(min_samples_leaf)))
    for data in data_sets:
        set_data, set_target = functions.get_data_from_csv(data)

        kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)

        for i in range(len(max_depth)):
            for j in range(len(min_samples_leaf)):
                classifier_types["dt"] = BinaryTreeClassifier(max_depth=max_depth[i], min_samples_leaf=min_samples_leaf[j])
                # classifier_types["dtc"] = BinaryTreeClassifier(max_depth=max_depth[i], min_samples_leaf=min_samples_leaf[j])
                # classifier_types["rf"] = RandomForest(max_depth=max_depth[i],min_samples_leaf=min_samples_leaf[j])
                # classifier_types["rfc"] = RandomForestClassifier(max_depth=max_depth[i],min_samples_leaf=min_samples_leaf[j])
                # classifier_types["knn"] = KNeighborsClassifier(n_neighbors=max_depth[i], leaf_size=min_samples_leaf[j])

                for c_type, classifier in classifier_types.items():
                    result = list()

                    average_accuracy = 0.0

                    for train, test in kf:
                        train_set = set_data[train]
                        train_class = set_target[train]
                        test_set = set_data[test]
                        test_class = set_target[test]

                        classifier.fit(train_set, train_class.ravel())

                        pred = classifier.predict(test_set)

                        result.append(functions.predictions_to_values(test_class, pred))

                    for true_vals, pred_vals in result:
                        average_accuracy += metrics.accuracy_score(true_vals, pred_vals)

                    average_accuracy /= float(len(result))
                    test_result[i][j] += average_accuracy / len(data_sets) / len(classifier_types)

    print "{0:5}".format(""),
    for i in min_samples_leaf:
        print "{0:<5d}".format(i),
    print "\n",
    for i in range(len(max_depth)):
        print "{0:>5d}".format(max_depth[i]),
        for j in range(len(test_result[i])):
            print "{0:<5.3f}".format(test_result[i][j]),
        print "\n",
Example #2
from sklearn import cross_validation, metrics  # pre-0.18 scikit-learn API
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import functions  # project-local helpers
from classifiers import BinaryTreeClassifier, RandomForest  # project-local; path assumed


def main():

    data_sets = ["binary/breast-w.csv", "binary/diabetes.csv", "binary/labor.csv",
                 "multi/iris.csv", "multi/letter.csv", "multi/vehicle.csv"]

    # Data set for experiment 3
    # data_sets = ["raop.csv"]

    folds = 10

    dt = BinaryTreeClassifier(max_depth=30, min_samples_leaf=10)
    dtc = DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
    rf = RandomForest(max_depth=40, min_samples_leaf=15)
    rfc = RandomForestClassifier(max_depth=50, min_samples_leaf=5)
    kNN = KNeighborsClassifier(n_neighbors=10, leaf_size=10)

    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc, "kNN": kNN}

    for data in data_sets:
        set_data, set_target = functions.get_data_from_csv(data)

        kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
        print("\nData set: {0}".format(data))
        print("Accuracy:")
        for c_type, classifier in classifier_types.items():
            result = list()

            average_acc = 0.0

            for train, test in kf:
                train_set = set_data[train]
                train_class = set_target[train]
                test_set = set_data[test]
                test_class = set_target[test]

                classifier.fit(train_set, train_class.ravel())

                pred = classifier.predict(test_set)

                result.append(functions.predictions_to_values(test_class, pred))

            for true_vals, pred_vals in result:
                average_acc += metrics.accuracy_score(true_vals, pred_vals)

            average_acc /= float(len(result))

            print("{0:5}{1:5.3f}".format(c_type, average_acc))
Example #3
from sklearn import cross_validation  # pre-0.18 scikit-learn API
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import functions  # project-local helpers
from classifiers import BinaryTreeClassifier, RandomForest  # project-local; path assumed
from stats import mcnemar_midp  # project-local; path assumed


def main():

    folds = 10

    dt = BinaryTreeClassifier(max_depth=30, min_samples_leaf=10)
    dtc = DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
    rf = RandomForest(max_depth=40, min_samples_leaf=15)
    rfc = RandomForestClassifier(max_depth=50, min_samples_leaf=5)
    kNN = KNeighborsClassifier(n_neighbors=10, leaf_size=10)

    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc, "kNN": kNN}

    results = {}

    set_data, set_target = functions.get_data_from_csv("raop.csv")

    kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)

    for c_type, classifier in classifier_types.items():
        results[c_type] = 0

        for train, test in kf:
            train_set = set_data[train]
            train_class = set_target[train]
            test_set = set_data[test]
            test_class = set_target[test]

            classifier.fit(train_set, train_class.ravel())

            prob = classifier.predict(test_set)

            true_set, pred_set = functions.predictions_to_values(test_class, prob)

            results[c_type] += true_set[true_set == pred_set].size

    # Pairwise McNemar mid-p tests on the per-classifier correct-prediction counts
    processed = []
    for c_type, count in results.items():
        for c_type_2, count_2 in results.items():
            if c_type != c_type_2 and c_type_2 not in processed:
                print("{0:>7} vs {1:<7} {2:.6f}".format(c_type, c_type_2,
                                                        mcnemar_midp(count, count_2)))
        processed.append(c_type)
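mcnemar_midp is another project-local helper. A common formulation of the mid-p McNemar test over two discordant counts, built on scipy.stats.binom, looks like the sketch below; this is an assumption about the helper, not its confirmed implementation. Note that this example feeds it per-classifier totals of correct predictions rather than discordant-pair counts.

from scipy.stats import binom

def mcnemar_midp(b, c):
    # Mid-p McNemar test for counts b and c: the two-sided exact binomial
    # p-value minus the probability of the observed point (mid-p correction).
    n = b + c
    x = min(b, c)
    dist = binom(n, 0.5)
    return 2.0 * dist.cdf(x) - dist.pmf(x)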
Example #4
from time import time

import numpy
from scipy.stats import wilcoxon
from sklearn import cross_validation, metrics  # pre-0.18 scikit-learn API
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import functions  # project-local helpers
from classifiers import BinaryTreeClassifier, RandomForest  # project-local; path assumed


def main():

    # Tests for multi-class datasets
    set_data, set_target = functions.get_data_from_csv("multi/glass.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/iris.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/letter.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/segment.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/splice.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/waveform-5000.csv")
    # set_data, set_target = functions.get_data_from_csv("multi/vehicle.csv")
    #
    # # Tests for binary datasets
    # set_data, set_target = functions.get_data_from_csv("binary/balance-scale.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/breast-cancer.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/breast-w.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/credit-a.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/credit-g.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/diabetes.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/haberman.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/heart-c.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/heart-h.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/heart-s.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/hepatitis.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/ionosphere.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/kr-vs-kp.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/labor.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/liver-disorders.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/mushroom.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/sick.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/sonar.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/spambase.csv")
    # set_data, set_target = functions.get_data_from_csv("binary/tic-tac-toe.csv")

    # dt and rf are our own implementations; dtc and rfc are the scikit-learn equivalents
    dt = BinaryTreeClassifier()
    dtc = DecisionTreeClassifier()
    rf = RandomForest()
    rfc = RandomForestClassifier()

    classifier_types = {"dt": dt, "rf": rf, "dtc": dtc, "rfc": rfc}

    printed = False
    folds = 10
    kf = cross_validation.KFold(set_data.shape[0], n_folds=folds)
    accuracy_result = {}

    for c_type, classifier in classifier_types.items():

        result = list()
        prob = list()

        average_accuracy = 0.0
        average_precision = 0.0
        average_recall = 0.0
        average_auc = 0.0
        average_train_time = 0.0
        average_test_time = 0.0

        print(c_type)

        for train, test in kf:
            train_set = set_data[train]
            train_class = set_target[train]
            test_set = set_data[test]
            test_class = set_target[test]

            timer = time()
            classifier.fit(train_set, train_class.ravel())
            if c_type == 'dt' and not printed:
                classifier.print_tree_wrapper()
                printed = True
            average_train_time += time() - timer

            timer = time()
            pred = classifier.predict(test_set)
            prob.append(classifier.predict_proba(test_set))
            average_test_time += time() - timer

            result.append(functions.predictions_to_values(test_class, pred))

        accuracy_result[c_type] = list()

        for res in range(len(result)):
            average_accuracy += metrics.accuracy_score(result[res][0], result[res][1])
            accuracy_result[c_type].append(metrics.accuracy_score(result[res][0], result[res][1]))
            # Note: on scikit-learn >= 0.18, multi-class targets need an explicit
            # average= argument here (older releases defaulted to 'weighted')
            average_precision += metrics.precision_score(result[res][0], result[res][1])
            average_recall += metrics.recall_score(result[res][0], result[res][1])

            unique = numpy.unique(result[res][0])
            proba = numpy.array(prob[res])
            # Score each sample by its highest predicted class probability
            max_proba = proba.max(axis=1)

            if len(unique) > 2:
                for i in unique:
                    fpr, tpr, thresholds = metrics.roc_curve(result[res][0], max_proba, pos_label=int(i))
                    average_auc += metrics.auc(fpr, tpr)
                average_auc /= unique.size / 2.0
            else:
                fpr, tpr, thresholds = metrics.roc_curve(result[res][0], max_proba)
                average_auc += metrics.auc(fpr, tpr)

        print ("Accuracy: "),
        print('{0:.3f}'.format(average_accuracy / float(len(result))))
        print ("Precision: "),
        print('{0:.3f}'.format(average_precision / float(len(result))))
        print ("Recall: "),
        print('{0:.3f}'.format(average_recall / float(len(result))))
        print ("Auc: "),
        print('{0:.3f}'.format(average_auc / float(len(result))))
        print ("")
        print('Training time: '),
        print(average_train_time / float(len(result)))
        print('Test time: '),
        print(average_test_time / float(len(result)))

    print ("Wilcoxon Decision Tree Classifiers: "),
    print (wilcoxon(accuracy_result['dt'], accuracy_result['dtc']))
    print ("Wilcoxon Random Forest Classifiers: "),
    print (wilcoxon(accuracy_result['rf'], accuracy_result['rfc']))
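All four examples use the pre-0.18 sklearn.cross_validation API, in which a KFold(n, n_folds=k) object is iterated directly. On scikit-learn 0.18 and later, where that module was first deprecated and then removed, the equivalent pattern is the self-contained sketch below (the stand-in arrays are placeholders, not project data):

import numpy
from sklearn.model_selection import KFold

set_data = numpy.arange(20).reshape(10, 2)  # stand-in feature matrix
folds = 5

kf = KFold(n_splits=folds)
for train, test in kf.split(set_data):
    # train and test are index arrays, exactly as in the old API
    train_set, test_set = set_data[train], set_data[test]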