Beispiel #1
0
def test_case():
    """Run a small grid search for OurDecisionTreeClassifier on the diabetes CSV.

    Loads the header-less CSV file, splits it into features and labels with
    ``unzip_features_and_labels``, hands both to a ``ClassifierFactory`` and
    configures that factory with a ``Parameters`` iterator and a callable
    that builds an ``OurDecisionTreeClassifier`` from a parameter pair
    (``max_features``, ``min_sample_leaf``).  The configured factory is then
    passed to ``grid_search``.
    """
    data = pandas.read_csv(
        r"ILS Projekt Dataset\csv_binary\binary\diabetes.csv", header=None)
    # ``pandas.np`` was deprecated in pandas 1.0 and removed in 2.0; the
    # DataFrame's own ``values`` attribute yields the same ndarray.
    data = data.values
    features, labels = unzip_features_and_labels(data)

    fac = ClassifierFactory()
    fac.set_data(features, labels)

    # NOTE(review): the two lists are presumably the start values and the
    # step sizes of the searched parameters -- confirm against ``Parameters``.
    max_features = 1
    max_features_step = 10

    min_sample_leafs = 1
    min_sample_leafs_step = 10

    iterations = 2

    parameter_iterator = Parameters(iterations,
                                    [max_features, min_sample_leafs],
                                    [max_features_step, min_sample_leafs_step])

    fac.set_parameter_iterator(parameter_iterator)
    # params[0] -> max_features, params[1] -> min_sample_leaf
    fac.set_classifier_factory(lambda params: OurDecisionTreeClassifier(
        max_features=params[0], min_sample_leaf=params[1]))
    grid_search(fac)
def start(data_set):
    """Train and evaluate OurDecisionTreeClassifier once and print the result.

    The data set is converted to an ndarray, split into features and labels
    with ``unzip_features_and_labels`` and divided into a 90 % training and
    10 % test partition (seeded from the current time).  Fitting and
    predicting are timed separately, the prediction is scored with
    ``compare`` and the metrics, the timings and finally the fitted tree
    (``odtc.model``) are printed.

    Args:
        data_set: anything ``numpy.array`` accepts.
            NOTE(review): presumably a pandas DataFrame -- confirm at caller.
    """
    odtc = OurDecisionTreeClassifier()

    # ``pandas.np`` was deprecated in pandas 1.0 and removed in 2.0; numpy is
    # already used directly further down in this function.
    data_set = numpy.array(data_set)
    features_, labels_ = unzip_features_and_labels(data_set)

    train_features, test_features, train_labels, test_labels = \
        train_test_split(
            features_, labels_,
            test_size=0.1,
            random_state=int(round(time.time()))
        )
    # un-numpy the arrays before predicting
    train_features, test_features, train_labels, test_labels = flatten_num_py_arrays(
        [train_features, test_features, train_labels, test_labels])

    # train and test our tree, timing both phases
    fit_started = time.time()
    odtc.fit(train_features, train_labels)
    our_fit = time.time() - fit_started

    predict_started = time.time()
    prediction1 = odtc.predict(test_features)
    our_predict = time.time() - predict_started

    a1, p1, r1, auc1 = compare(prediction1, test_labels, labels_)

    print("\n----------------------------------------")
    print("\nFor our Decision Tree Classifier:")
    # Only a single run is made, so each "average" is the mean of one value;
    # going through numpy keeps the printed number format unchanged.
    for label, value in (("accuracy", a1),
                         ("precision", p1),
                         ("recall", r1),
                         ("AUC", auc1),
                         ("training time", our_fit),
                         ("testing time", our_predict)):
        print("Our average " + label + ":", numpy.array([value]).mean())
    print("\n----------------------------------------\n\n")

    print_tree_vertical(odtc.model)
def wilcoxon_test(data_file):
    """Compare our classifiers with the reference ones using Wilcoxon tests.

    Over 20 rounds the data is split 90/10, all four classifiers (our and the
    reference decision tree, our and the reference random forest) are fitted
    on the training part and scored on the test part with ``accuracy_test``.
    The paired accuracy lists are passed to ``wilcoxon`` -- once for the two
    decision trees and once for the two random forests.  The accuracies and
    test results are printed, and the results are written to ``test.xlsx``
    through ``Excelifyer``.

    Args:
        data_file: anything ``numpy.array`` accepts.
            NOTE(review): presumably a pandas DataFrame -- confirm at caller.
    """
    # Local import: ``pandas.np`` was deprecated in pandas 1.0 and removed in
    # 2.0, and this function does not otherwise reference numpy.
    import numpy

    odtc = OurDecisionTreeClassifier()
    tdtc = DecisionTreeClassifier()
    orf = OurRandomForrestClassifier(sample_size=0.3, n_estimators=11)
    trf = RandomForestClassifier(n_estimators=11)

    our_accuracy_dtc_array = []
    our_accuracy_rf_array = []
    their_accuracy_dtc_array = []
    their_accuracy_rf_array = []

    # The input does not change between rounds, so convert and unzip it once.
    data_set = numpy.array(data_file)
    features_, labels_ = unzip_features_and_labels(data_set)

    for i in range(20):
        # A purely clock-based seed repeats for every round that starts within
        # the same second, which would produce identical splits (and thus
        # duplicated samples for the test).  Adding the round index makes the
        # seeds strictly increasing and therefore distinct.
        train_features, test_features, train_labels, test_labels = \
            train_test_split(
                features_, labels_,
                test_size=0.1,
                random_state=int(round(time.time())) + i
            )

        odtc.fit(train_features, train_labels)
        tdtc.fit(train_features, train_labels)
        orf.fit(train_features, train_labels)
        trf.fit(train_features, train_labels.ravel())
        pre1 = odtc.predict(test_features)
        pre2 = tdtc.predict(test_features)
        pre3 = orf.predict(test_features)
        pre4 = trf.predict(test_features)
        our_accuracy_dtc_array.append(accuracy_test(pre1, test_labels))
        their_accuracy_dtc_array.append(accuracy_test(pre2, test_labels))
        our_accuracy_rf_array.append(accuracy_test(pre3, test_labels))
        their_accuracy_rf_array.append(accuracy_test(pre4, test_labels))

    print(our_accuracy_dtc_array)
    print(their_accuracy_dtc_array)
    w_dtc = wilcoxon(our_accuracy_dtc_array, their_accuracy_dtc_array)
    print("-----------------------------------------------------------")
    print(our_accuracy_rf_array)
    print(their_accuracy_rf_array)
    w_rf = wilcoxon(our_accuracy_rf_array, their_accuracy_rf_array)

    print("\nDecisionTreeClassifiers:", w_dtc)
    print("\nRandomForestClassifiers", w_rf)

    # Row 0 holds the column captions, rows 1 and 2 the two test results.
    doc = Excelifyer(use_column_headers=False)
    doc.at_row(0, 'headers', ['statistic', 'pvalues'])
    doc.at_row(1, 'DecisionTreeClassifiers', w_dtc)
    doc.at_row(2, 'RandomForestClassifiers', w_rf)
    doc.to_excel('test.xlsx')
# Metrics collected per classifier, in the order they are printed and exported.
_START_METRICS = ('accuracy', 'precision', 'recall', 'AUC',
                  'training time', 'testing time')


def _timed_fit_predict(classifier, train_features, train_labels,
                       test_features):
    """Fit and predict with *classifier*; return (prediction, fit_s, predict_s)."""
    fit_started = time.time()
    classifier.fit(train_features, train_labels)
    fit_seconds = time.time() - fit_started

    predict_started = time.time()
    prediction = classifier.predict(test_features)
    predict_seconds = time.time() - predict_started
    return prediction, fit_seconds, predict_seconds


def _record_metrics(store, values):
    """Append *values* (ordered like ``_START_METRICS``) to the lists in *store*."""
    for metric, value in zip(_START_METRICS, values):
        store[metric].append(value)


def _average(values):
    """Return the mean of *values*, or NaN (without a numpy warning) if empty."""
    if not values:
        return float('nan')
    return numpy.array(values).mean()


def start(data_set, rf_flag=False, max_features=None):
    """Benchmark our classifiers against the reference ones over ten splits.

    For each of ten 90/10 train/test splits the function fits and times our
    decision tree and the reference decision tree, scores both predictions
    with ``compare`` and collects accuracy, precision, recall, AUC and the
    fit/predict durations.  With ``rf_flag`` set, the same is done for the two
    random-forest classifiers.  The averages are printed and written to
    ``diabetes.xlsx`` (one sheet per classifier).

    Args:
        data_set: anything ``numpy.array`` accepts.
            NOTE(review): presumably a pandas DataFrame -- confirm at caller.
        rf_flag: also benchmark and print the random-forest classifiers.
            When false, the two random-forest sheets are still written but
            contain NaN values, because nothing was measured.
        max_features: currently unused; kept so existing callers keep working.
    """
    odtc = OurDecisionTreeClassifier()
    dtc = DecisionTreeClassifier()
    orf = OurRandomForrestClassifier(sample_size=0.3, n_estimators=11)
    rf = RandomForestClassifier(n_estimators=11)

    # (sheet name, caption prefix, printed title, is random forest)
    reports = [
        ('our dtc', 'Our', 'our Decision Tree Classifier', False),
        ('their dtc', 'Their', 'their Decision Tree Classifier', False),
        ('our rf', 'Our', 'our Random Forest Classifier', True),
        ('their rf', 'Their', 'their Random Forest Classifier', True),
    ]
    stores = {}
    for sheet, _, _, _ in reports:
        stores[sheet] = {metric: [] for metric in _START_METRICS}

    # ``pandas.np`` was deprecated in pandas 1.0 and removed in 2.0.
    data_set = numpy.array(data_set)
    features_, labels_ = unzip_features_and_labels(data_set)

    for i in range(10):
        print("Iteration: ", i * 10)
        # A purely clock-based seed repeats whenever two iterations start
        # within the same second, yielding identical splits.  Adding the
        # iteration index makes the seeds strictly increasing.
        train_features, test_features, train_labels, test_labels = \
            train_test_split(
                features_, labels_,
                test_size=0.1,
                random_state=int(round(time.time())) + i
            )
        # un-numpy the arrays before predicting
        train_features, test_features, train_labels, test_labels = flatten_num_py_arrays(
            [train_features, test_features, train_labels, test_labels])

        # train and test our tree, then their tree
        prediction1, our_fit, our_predict = _timed_fit_predict(
            odtc, train_features, train_labels, test_features)
        prediction2, their_fit, their_predict = _timed_fit_predict(
            dtc, train_features, train_labels, test_features)

        a1, a2, p1, p2, r1, r2, auc1, auc2 = compare(prediction1, prediction2,
                                                     test_labels, labels_)
        _record_metrics(stores['our dtc'],
                        (a1, p1, r1, auc1, our_fit, our_predict))
        _record_metrics(stores['their dtc'],
                        (a2, p2, r2, auc2, their_fit, their_predict))

        if rf_flag:
            # train and test our forest, then their forest
            prediction3, our_fit1, our_predict1 = _timed_fit_predict(
                orf, train_features, train_labels, test_features)
            prediction4, their_fit1, their_predict1 = _timed_fit_predict(
                rf, train_features, train_labels, test_features)

            a3, a4, p3, p4, r3, r4, auc3, auc4 = compare(
                prediction3, prediction4, test_labels, labels_)
            _record_metrics(stores['our rf'],
                            (a3, p3, r3, auc3, our_fit1, our_predict1))
            _record_metrics(stores['their rf'],
                            (a4, p4, r4, auc4, their_fit1, their_predict1))

    # Average every metric once; the same numbers are printed and exported.
    averages = {}
    for sheet, _, _, _ in reports:
        averages[sheet] = [(metric, _average(stores[sheet][metric]))
                           for metric in _START_METRICS]

    for sheet, owner, title, is_random_forest in reports:
        if is_random_forest and not rf_flag:
            continue
        print("\n----------------------------------------")
        print("\nFor " + title + ":")
        for metric, value in averages[sheet]:
            print(owner + " average " + metric + ":", value)

    # ``ExcelWriter.save()`` was removed in pandas 2.0; the context manager
    # writes and closes the workbook on every supported pandas version.
    with pandas.ExcelWriter('diabetes.xlsx', engine='xlsxwriter') as writer:
        for sheet, owner, _, _ in reports:
            frame = pandas.DataFrame(
                {
                    '{} average {}'.format(owner, metric): value
                    for metric, value in averages[sheet]
                },
                index=[0])
            frame.to_excel(writer, sheet_name=sheet)
Beispiel #5
0
def simple_grid_search(data_set, file_name):
    """Grid-search ``max_features`` / minimum leaf size for five classifiers.

    The data is split once (70 % training, 30 % test).  For every combination
    of ``max_features`` (multiples of 2, clamped to the number of feature
    columns minus one) and leaf size (multiples of 5) our decision tree, our
    random forest, the reference decision tree, the reference random forest
    and a k-nearest-neighbours classifier are fitted and scored with
    ``accuracy_test``.  For each algorithm the best-scoring combination is
    printed, the best algorithm overall is printed, and a ranking table is
    written to ``'gridSearchRanking' + file_name + '.xlsx'``.

    Args:
        data_set: anything ``numpy.array`` accepts.
            NOTE(review): presumably a pandas DataFrame -- confirm at caller.
        file_name: text inserted into the name of the written Excel file.
    """
    # Local import: ``pandas.np`` was deprecated in pandas 1.0 and removed in
    # 2.0, and this function does not otherwise reference numpy.
    import numpy

    data_set = numpy.array(data_set)
    features_, labels_ = unzip_features_and_labels(data_set)

    train_features, test_features, train_labels, test_labels = \
        train_test_split(
            features_, labels_,
            test_size=0.3,
            random_state=int(round(time.time()))
        )
    # un-numpy the arrays before predicting
    train_features, test_features, train_labels, test_labels = flatten_num_py_arrays(
        [train_features, test_features, train_labels, test_labels])

    algorithms = ('ODTC', 'ORFC', 'DTC', 'RFC', 'KNN')
    # algorithm -> list of (max_features, sample_leaf, accuracy), in the
    # order the grid is walked.  Storing the parameters that were really used
    # avoids reconstructing them from grid indices afterwards.
    algorithm_results = {algorithm: [] for algorithm in algorithms}

    max_features_step = 2
    sample_leaf_step = 5
    feature_count = len(train_features[0])

    for x in range(1, 10):
        max_features = x * max_features_step
        if max_features >= feature_count:
            max_features = feature_count - 1
        for y in range(1, 20):
            sample_leaf = y * sample_leaf_step
            odtc = OurDecisionTreeClassifier(max_features=max_features,
                                             min_sample_leaf=sample_leaf)
            orfc = OurRandomForrestClassifier(max_features=max_features,
                                              min_sample_leaf=sample_leaf,
                                              sample_size=0.3,
                                              n_estimators=11)
            dtc = DecisionTreeClassifier(max_features=max_features,
                                         min_samples_leaf=sample_leaf)
            rfc = RandomForestClassifier(max_features=max_features,
                                         min_samples_leaf=sample_leaf,
                                         n_estimators=11)
            # KNN has no max_features; only its leaf_size follows the grid.
            knn = KNeighborsClassifier(leaf_size=sample_leaf)

            odtc.fit(train_features, train_labels)
            orfc.fit(train_features, train_labels)
            dtc.fit(train_features, train_labels)
            rfc.fit(train_features, train_labels.ravel())
            knn.fit(train_features, train_labels.ravel())

            predictions = (
                ('ODTC', odtc.predict(test_features)),
                ('ORFC', orfc.predict(test_features)),
                ('DTC', dtc.predict(test_features)),
                ('RFC', rfc.predict(test_features)),
                ('KNN', knn.predict(test_features)),
            )
            for algorithm, prediction in predictions:
                accuracy = accuracy_test(prediction, test_labels)
                algorithm_results[algorithm].append(
                    (max_features, sample_leaf, accuracy))
            print(x, y)

    optimas = {}
    for algorithm in algorithms:
        # ``max`` keeps the first of several equally good entries, i.e. the
        # combination that was visited first in the grid.
        best = max(algorithm_results[algorithm], key=lambda entry: entry[2])
        # [max features, min sample leafs, accuracy for that combination]
        optimas[algorithm] = list(best)
        print(algorithm, optimas[algorithm])

    best_alg = max(algorithms, key=lambda algorithm: optimas[algorithm][2])
    print(best_alg)

    doc = Excelifyer(use_column_headers=False)
    doc.at_row(0, ' ',
               ['Algorithm', 'Max features', 'Min sample leafs', 'Accuracy'])
    # Data rows start at 1 so the caption row written above is kept.
    # NOTE(review): assumes the first ``at_row`` argument is the row index,
    # as its use elsewhere in the project suggests -- confirm in Excelifyer.
    for row_index, algorithm in enumerate(algorithms, start=1):
        arr = [algorithm] + optimas[algorithm]
        print(arr)
        doc.at_row(row_index, ' ', arr)

    doc.to_excel('gridSearchRanking' + file_name + '.xlsx')