# Example #1
def classify_separated_test_directory():
    """Classify using separate training and test record directories.

    Training records are read from ``config.get_record_dir()`` and test
    records from ``SEPARATED_TEST_RECORD_DIR``. The test records are
    appended to the training records so feature extraction sees one
    combined data set, and explicit train/test index lists are passed to
    the classifier.

    Raises:
        ValueError: If the two directories do not contain the same number
            of input files.
    """
    print(
        "\nDo classification with different training directory and test directory"
    )
    print("\nTraining directory: " + config.get_record_dir())
    print("\nTest directory: " + SEPARATED_TEST_RECORD_DIR)

    file_contents, labels = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    test_data, test_labels = input_parser.parse_input_files(
        SEPARATED_TEST_RECORD_DIR, combine_sc_vectors=True)
    if len(file_contents) != len(test_data):
        raise ValueError(
            "Different number of input files in training directory and test directory - must be equal"
        )

    # Training samples occupy the first len(labels) rows; test samples follow.
    train_index = list(range(0, len(labels)))
    test_index = list(range(len(labels), len(labels) + len(test_labels)))

    # Append test data to training data, pairwise per input file.
    for training_file_content, test_file_content in zip(file_contents,
                                                        test_data):
        training_file_content.records.extend(test_file_content.records)
    labels.extend(test_labels)

    X = ft.extract_preconfigured_features(file_contents)
    Y = pd.Series(labels)

    Y_test, predictions, accuracy = app_classifier.do_classification(
        X, Y, train_index, test_index)
    print(classification_report(Y_test, predictions))
# Example #2
def get_used_side_channels(file_dict):
    """Return the side-channel file contents selected by the configuration.

    When ``config.USE_TARGETED_SIDECHANNELS`` is set, only the channels
    named in ``config.TARGETED_SIDECHANNELS`` are returned; otherwise
    every file in ``file_dict`` is used.
    """
    if config.USE_TARGETED_SIDECHANNELS:
        print(
            "\nUse targeted side channels in target directory " +
            config.get_record_dir() + ": ", config.TARGETED_SIDECHANNELS)
        return [
            get_file(file_dict, channel)
            for channel in config.TARGETED_SIDECHANNELS
        ]

    print("\nUse " + str(len(file_dict)) +
          " side channels in target directory " + config.get_record_dir())
    selected = []
    for name, content in file_dict.items():
        print("Use side channel " + name)
        selected.append(content)
    return selected
# Example #3
def main():
    """Run a combined k-fold classification over all input files."""
    timing.start_measurement()

    print("Do combined classification using all input files")
    contents, label_values = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    feature_matrix = ft.extract_preconfigured_features(contents)
    target_series = pd.Series(label_values)
    app_classifier.do_kfold_cross_validation(feature_matrix, target_series)

    timing.stop_measurement()
def explorative_classification():
    """Cross-validate each input file on its own and print a sorted summary."""
    file_contents, label_list = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=False)

    results = []
    for position, content in enumerate(file_contents):
        print("\nEvaluate ", content.file_name)
        accuracy = app_classifier.do_kfold_cross_validation(
            [content], pd.Series(label_list[position]), verbose=False)
        results.append(ClassificationResult(accuracy, content.file_name))

    # Best accuracy first.
    results.sort(key=lambda result: result.accuracy, reverse=True)

    print("\nSummary for files in " + config.get_record_dir() + ":\n")
    for result in results:
        print(result)
# Example #5
def main():
    """Run combined classification and plot the aggregated accuracies."""
    timing.start_measurement()

    print("Do combined classification using all input files")
    contents, label_values = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    feature_matrix = ft.extract_preconfigured_features(contents)
    target_series = pd.Series(label_values)

    (_, first_acc, second_acc, third_acc,
     single_accs) = app_classifier.do_kfold_cross_validation(
         feature_matrix, target_series)

    # Plot the top-1/2/3 accuracies, then the per-channel accuracies.
    plt.plot([first_acc, second_acc, third_acc])
    plt.show()

    plt.plot(single_accs)
    plt.show()

    timing.stop_measurement()
def explorative_classification():
    """Evaluate every input file separately with k-fold cross validation.

    For each side-channel file this runs k-fold cross validation on its
    own, collects the overall plus top-1/2/3 accuracies and the per-fold
    single accuracies, prints the sorted summaries and writes the single
    accuracies to one result file per input file.
    """
    file_contents, label_list = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=False)
    results = []
    results_first = []
    results_second = []
    results_third = []
    single_results = []

    for idx, fc in enumerate(file_contents):
        labels = label_list[idx]

        print("\nEvaluate ", fc.file_name)
        X = [fc]
        Y = pd.Series(labels)

        # file_name[:-4] strips the file extension for the result files.
        (total_accuracy, total_first_acc, total_second_acc, total_third_acc,
         total_single_accuracies) = app_classifier.do_kfold_cross_validation(
             X, Y, verbose=True, file_name=fc.file_name[:-4])

        results.append(
            ClassificationResult(round_float(total_accuracy), fc.file_name))
        results_first.append(
            ClassificationResult(round_float(total_first_acc), fc.file_name))
        results_second.append(
            ClassificationResult(round_float(total_second_acc), fc.file_name))
        results_third.append(
            ClassificationResult(round_float(total_third_acc), fc.file_name))
        single_results.append([
            ClassificationResult(round_float(acc), fc.file_name)
            for acc in total_single_accuracies
        ])

    # Sort every summary list by accuracy, best first.
    for result_list in (results, results_first, results_second,
                        results_third):
        result_list.sort(
            key=lambda classification_result: classification_result.accuracy,
            reverse=True)

    print("\nSummary for files in " + config.get_record_dir() + ":\n")
    for r in results:
        print(r)
    print("\nSummary of first for files in " + config.get_record_dir() + ":\n")
    for r in results_first:
        print(r)
    print("\nSummary of second for files in " + config.get_record_dir() +
          ":\n")
    for r in results_second:
        print(r)
    print("\nSummary of third for files in " + config.get_record_dir() + ":\n")
    for r in results_third:
        print(r)

    # Write the per-position single accuracies: one result file per input
    # file, one "rank, accuracy" line per entry.
    results_dir = (config.RECORD_BASE_DIR + config.get_record_dir() +
                   config.RESULTS_DIR)
    if single_results and not os.path.exists(results_dir):
        os.makedirs(results_dir)
    for per_file_results in single_results:
        if not per_file_results:
            continue
        # with-statement guarantees the file is closed even on write errors.
        with open(results_dir + per_file_results[0].file_name, "w") as out:
            for idx, r in enumerate(per_file_results):
                out.write(str(idx + 1) + ", " + str(r.accuracy) + "\n")
# Example #7
def do_kfold_cross_validation(X, Y, verbose=True, file_name=""):
    """Run stratified k-fold cross validation over the given features.

    Args:
        X: List of file-content objects; ``X[0].records`` provides the
            rows used to split the folds.
        Y: pandas Series of labels aligned with the records.
        verbose: When True, additionally print and save the classification
            report and accumulate/save a confusion matrix.
        file_name: Base name (without extension) for the result files.

    Returns:
        Tuple ``(total_accuracy, total_first_acc, total_second_acc,
        total_third_acc, total_single_accuracies)``, each averaged over
        all folds.
    """
    folds = config.FOLDS
    printv("\nSelecting rows for " + str(folds) + "-fold validation", verbose)
    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    kf.get_n_splits()

    # Confusion matrix: rows are true labels, columns are predictions.
    unique_labels = Y.unique()
    cnf_mat = pd.DataFrame(np.zeros((len(unique_labels), len(unique_labels))),
                           columns=unique_labels)
    cnf_mat.set_index(keys=unique_labels, inplace=True)

    Y_test_all_folds = []
    predictions_all_folds = []
    summed_accuracy = 0
    summed_first_acc = 0
    summed_second_acc = 0
    summed_third_acc = 0
    summed_single_accuracies = []

    fold_cnt = 1
    first_file_content = X[0]
    split_var = first_file_content.records

    dist_matrices = precomputed_knn_selector.init_dist_matrices(X)

    for train_indices, test_indices in kf.split(split_var, Y):

        printv("\nFold: " + str(fold_cnt), verbose)

        (Y_test, predictions, accuracy, acc_first, acc_second, acc_third,
         single_accuracies) = do_classification(X, Y, train_indices,
                                                test_indices, dist_matrices)

        if verbose:
            # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
            for idx, pred in enumerate(predictions):
                cnf_mat.loc[Y_test.iloc[idx], pred] += 1

        printv("Accuracy:" + str(accuracy), verbose)
        summed_accuracy += accuracy
        summed_first_acc += acc_first
        summed_second_acc += acc_second
        summed_third_acc += acc_third
        # Grow the running-sum list lazily; folds may report the same
        # number of single accuracies each time, but don't assume it.
        for idx, single_accuracy in enumerate(single_accuracies):
            if idx >= len(summed_single_accuracies):
                summed_single_accuracies.append(single_accuracy)
            else:
                summed_single_accuracies[idx] += single_accuracy
        fold_cnt += 1

        Y_test_all_folds.extend(Y_test.values.tolist())
        predictions_all_folds.extend(predictions.values.tolist())

    total_accuracy = summed_accuracy / folds
    total_first_acc = summed_first_acc / folds
    total_second_acc = summed_second_acc / folds
    total_third_acc = summed_third_acc / folds
    total_single_accuracies = [
        summed / folds for summed in summed_single_accuracies
    ]

    results_dir = (config.RECORD_BASE_DIR + config.get_record_dir() +
                   config.RESULTS_DIR)

    if verbose:
        classification_rep = classification_report(Y_test_all_folds,
                                                   predictions_all_folds)
        printv(classification_rep, verbose)
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        # with-statement guarantees the report file is closed on error.
        with open(results_dir + file_name + "_classification_report.txt",
                  "w") as report_file:
            report_file.write(classification_rep)

    print("\nTotal accuracy over all folds: " + str(total_accuracy))
    print("Total 1st accuracy over all folds: " + str(total_first_acc))
    print("Total 2nd accuracy over all folds: " + str(total_second_acc))
    print("Total 3rd accuracy over all folds: " + str(total_third_acc))
    print("Total single accuracies over all folds:", total_single_accuracies)

    if verbose:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        cnf_mat.to_csv(results_dir + file_name + "_confusion_matrix.txt",
                       sep=' ')

    return (total_accuracy, total_first_acc, total_second_acc,
            total_third_acc, total_single_accuracies)