Example #1
import argparse

import numpy as np
import pandas as pd

# The helpers used below (load_data_from_file, prepare_train_and_test_set,
# get_test_set_classes, calculate_statistics_by_class,
# test_set_classification) and the ConfusionMatrix class are assumed to be
# defined elsewhere in this module.

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        help="Path to file with data",
    )

    parser.add_argument(
        "-ts",
        "--train-size",
        type=float,
        help="Percentage of dataset used in the train set.",
    )

    parser.add_argument(
        "-srt",
        "--sort-train-set",
        action="store_true",
        help="Sorts train set after spliting dataset to train and test set",
    )

    args = parser.parse_args()

    _filename = args.filename
    _train_size = args.train_size
    _sort_train_set = args.sort_train_set

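    # Read the dataset from the file given on the command line.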
    dataset = load_data_from_file(_filename)

    # Uncomment to inspect how the classes group by two chosen features:
    # plot_labels_by_two_features(X1, X2, dataset)

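    # Single demonstration run: split the data, estimate per-class
    # statistics, classify the test set, and plot its confusion matrix.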
    results = pd.DataFrame()
    train_set, test_set = prepare_train_and_test_set(dataset, _train_size,
                                                     _sort_train_set)

    classes = get_test_set_classes(test_set)

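    # Per-class feature means, standard deviations, and prior
    # probabilities, i.e. the parameters the classifier needs.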
    (
        mean_values,
        std_variation_values,
        prior_probabilities,
    ) = calculate_statistics_by_class(train_set)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported replacement.
    results = pd.concat([
        results,
        test_set_classification(test_set, mean_values, std_variation_values,
                                prior_probabilities),
    ])

    confusion_matrix = ConfusionMatrix(results, classes)
    confusion_matrix.plot_confusion_matrix()

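    # Repeat the split/train/classify cycle ten times to measure the
    # run-to-run spread of the metrics.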
    confusion_matrices = []
    for _ in range(10):
        train_set, test_set = prepare_train_and_test_set(
            dataset, _train_size, _sort_train_set)

        classes = get_test_set_classes(test_set)

        (
            mean_values,
            std_variation_values,
            prior_probabilities,
        ) = calculate_statistics_by_class(train_set)

        results = test_set_classification(test_set, mean_values,
                                          std_variation_values,
                                          prior_probabilities)

        cm = ConfusionMatrix(results, classes)
        cm.calculate_metrics_by_classes()
        confusion_matrices.append(cm)

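    # Collect accuracy per run, and precision/recall per class, across
    # all ten runs.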
    accuracy_values = []
    precision_values = {}
    recall_values = {}
    for matrix in confusion_matrices:
        accuracy_values.append(matrix.accuracy)
        for class_name in matrix.metrics:
            if class_name not in precision_values:
                precision_values[class_name] = []
                recall_values[class_name] = []
            precision_values[class_name].append(
                matrix.metrics[class_name]["precision"])
            recall_values[class_name].append(
                matrix.metrics[class_name]["recall"])

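    # Report min/max/mean/standard deviation for each collected metric.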
    print(
        "Classifier overall accuracy statistics after 10 runs:\n"
        f"Min: {min(accuracy_values)}\n"
        f"Max: {max(accuracy_values)}\n"
        f"Mean: {np.mean(accuracy_values)}\n"
        f"Std deviation: {np.std(accuracy_values)}\n"
    )
    for class_name in precision_values:
        print(
            f"{class_name} precision statistics after 10 runs:\n"
            f"Min: {min(precision_values[class_name])}\n"
            f"Max: {max(precision_values[class_name])}\n"
            f"Mean: {np.mean(precision_values[class_name])}\n"
            f"Std deviation: {np.std(precision_values[class_name])}\n"
        )
        print(
            f"{class_name} recall statistics after 10 runs:\n"
            f"Min: {min(recall_values[class_name])}\n"
            f"Max: {max(recall_values[class_name])}\n"
            f"Mean: {np.mean(recall_values[class_name])}\n"
            f"Std deviation: {np.std(recall_values[class_name])}\n"
        )