Example 1
import statistics

import pandas as pd


def run_id3_decision_tree(df, prune=False):
    """Run ID3 decision-tree experiments using stratified 5-fold
    cross-validation and print per-fold and average accuracies."""
    # Split the dataset into five stratified folds
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(
        df)
    datasets = [train1, train2, train3, train4, train5]
    scores = []
    pruned_scores = []
    for i, df_test in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")

        # Use one subset as the test set
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()

        # Create a training set from remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")

        # Build the decision tree from the training set
        id3 = DecisionTree(df_train)
        id3.build_id3_tree()
        # id3.print_tree()  # uncomment to inspect the learned tree

        # Test the decision tree
        accuracy = id3.validate(id3.root, df_test)
        print(f"Percent accurate: {accuracy}%")
        scores.append(accuracy)

        # If pruning is turned on, test pruned tree accuracy
        if prune:
            p_accuracy = id3.validate_pruned_tree(df_test)
            print(f"Pruned Tree Percent Accurate: {p_accuracy}%")
            pruned_scores.append(p_accuracy)

    print("----------------------------")
    print(f"Averages over 5 experiments")
    print("----------------------------")
    print(f"ID3 Decision Tree Averages = {statistics.mean(scores)}%")
    if prune:
        print(
            f"Pruned ID3 Decision Tree Averages = {statistics.mean(pruned_scores)}%"
        )
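
All four examples call a split_into_random_stratified_groups(df) helper that is not shown. A minimal sketch of what it might look like, assuming the class label sits in the last column (the five-way unpacking above requires it to return five frames):

import pandas as pd


def split_into_random_stratified_groups(df, n_groups=5):
    """Shuffle df and deal its rows into n_groups folds so that each
    fold keeps roughly the same class proportions as the full set.
    Sketch only; assumes the class label is the last column."""
    label = df.columns[-1]
    parts = [[] for _ in range(n_groups)]
    # Within each class, deal the shuffled rows round-robin across folds
    for _, cls_rows in df.groupby(label):
        shuffled = cls_rows.sample(frac=1)
        for i in range(n_groups):
            parts[i].append(shuffled.iloc[i::n_groups])
    return tuple(pd.concat(p) for p in parts)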
Example 2
import statistics

import pandas as pd


def run_k_nearest_neighbor_experiments(df,
                                       k,
                                       run_condensed,
                                       classification=True):
    """Run k-nearest-neighbors experiments using stratified 5-fold
    cross-validation and print per-fold and average accuracies."""
    # Split the dataset into five stratified folds
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(
        df)
    # Run five experiments, using one of the sets as a test set each time
    k_scores = []
    k_condensed_scores = []
    datasets = [train1, train2, train3, train4, train5]
    for i, df_test in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")

        # Use one subset as the test set
        print(f"Test set size = {len(df_test)}")

        # Create a training set from the remaining subsets
        training_sets = datasets.copy()
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")

        # Run K-Nearest Neighbors
        print(f"k = {k}")
        print("Running k nearest neighbors...")
        knn = KNearestNeighbors(df_test, k, df.columns, classification)
        accuracy = knn.run(df_train)
        print(f"Percent accurate: {accuracy}%")
        k_scores.append(accuracy)

        if run_condensed:
            # Run Condensed K-Nearest Neighbors
            knn = KNearestNeighbors(df_test, k, df.columns, classification)
            accuracy = knn.run_condensed(df_train)
            print(f"Percent accurate: {accuracy}%")
            k_condensed_scores.append(accuracy)

    print("----------------------------------------")
    print(f"Averages over 5 experiments where k={k}")
    print("----------------------------------------")
    print(f"k-Nearest Neighbors = {statistics.mean(k_scores)}")
    if run_condensed:
        print(
            f"Condensed k-Nearest Neighbors = {statistics.mean(k_condensed_scores)}"
        )
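
A hypothetical invocation of the runner above (the file name and parameter values are illustrative, not from the source):

import pandas as pd

df = pd.read_csv("glass.csv")  # hypothetical dataset file
run_k_nearest_neighbor_experiments(df, k=5, run_condensed=True)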
Example 3
import statistics

import pandas as pd


def run_backpropagation(df, num_features, num_hidden):
    """
    This function runs a backpropagation neural network on the data frame and outputs statistics from five experiments
    :param df: The data set to run the algorithm on
    :param num_features: The number of features in this dataset
    :param num_hidden: The number of hidden nodes in the network
    :return: The mean accuracy over the five experiments
    """
    # Split the dataset into five stratified folds
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(
        df)
    datasets = [train1, train2, train3, train4, train5]
    nn_scores = []
    for i, df_test in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")

        # Use one subset as the test set
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()

        # Create a training set from remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")

        # Create the backpropagation neural network; the feature slices
        # span columns 0..num_features, and the label is column num_features
        print(df_train.iloc[:, 0:num_features + 1].head())
        nn = BackpropagationNeuralNetwork(df_train.columns[0:num_features],
                                          df_train.iloc[:, 0:num_features + 1],
                                          df_train.iloc[:, num_features],
                                          df_test.iloc[:, 0:num_features + 1],
                                          df_test.iloc[:, num_features],
                                          int(num_hidden))

        # Train the network
        nn.learn()

        # Test the network's accuracy
        nn_accuracy = nn.make_predictions()
        print(f"Percent accurate: {nn_accuracy}%")
        nn_scores.append(nn_accuracy)

    return statistics.mean(nn_scores)
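
A hypothetical call, assuming the label occupies the last column of the frame (the file name and hidden-layer size are illustrative):

import pandas as pd

df = pd.read_csv("soybean.csv")  # hypothetical dataset file
mean_acc = run_backpropagation(df, num_features=len(df.columns) - 1, num_hidden=5)
print(f"Mean accuracy over 5 folds = {mean_acc}%")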
Example 4
import statistics

import pandas as pd


def run_naive_bayes(df, num_features):
    """
    This function runs Naive Bayes on the data frame and outputs statistics from five experiments
    :param df: The data set to run the algorithm on
    :param num_features: The number of features in this dataset
    :return: The mean accuracy over the five experiments
    """
    # Split the dataset into five stratified folds
    print(f"Size of total dataset = {len(df)}")
    train1, train2, train3, train4, train5 = split_into_random_stratified_groups(
        df)
    datasets = [train1, train2, train3, train4, train5]
    nb_scores = []
    for i, df_test in enumerate(datasets):
        print("-------------")
        print(f"Experiment #{i + 1}")
        print("-------------")

        # Use one subset as the test set
        print(f"Test set size = {len(df_test)}")
        training_sets = datasets.copy()

        # Create a training set from remaining subsets
        del training_sets[i]
        df_train = pd.concat(training_sets)
        print(f"Training set size = {len(df_train)}")

        # Create Naive Bayes
        nb = NaiveBayes(df_train.iloc[:, 0:num_features],
                        df_train.iloc[:, num_features],
                        df_test.iloc[:, 0:num_features],
                        df_test.iloc[:, num_features])

        # Train with naive bayes
        nb.learn()

        # Test the accuracy of naive bayes
        nb_accuracy = nb.validate()
        print(f"Naive Bayes Percent accurate: {nb_accuracy}%")
        nb_scores.append(nb_accuracy)

    return statistics.mean(nb_scores)
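
All four runners repeat the same stratified 5-fold harness. A minimal sketch of how that loop could be factored out; the run_five_fold name and the evaluate(df_train, df_test) callback are hypothetical, and split_into_random_stratified_groups is the same unshown helper the examples rely on:

import statistics

import pandas as pd


def run_five_fold(df, evaluate):
    """Stratified 5-fold harness: evaluate(df_train, df_test) is a
    callback that trains a model and returns a percent accuracy."""
    folds = list(split_into_random_stratified_groups(df))
    scores = []
    for i, df_test in enumerate(folds):
        # Concatenate the other four folds into the training set
        df_train = pd.concat(folds[:i] + folds[i + 1:])
        scores.append(evaluate(df_train, df_test))
    return statistics.mean(scores)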