Example #1
0
def run_sfs_glass(filename, target_class):
    # Setup data
    glass_obj = glass.Glass()
    glass_data = glass_obj.setup_data_glass(filename=filename,
                                            target_class=target_class)

    # Split the data set into 2/3 and 1/3
    glass_data_train = glass_data.sample(frac=.667)
    glass_data_test = glass_data.drop(glass_data_train.index)

    # Create Naive Bayes for later comparison (Added in last, so that is why it is not greatly structured in the code)
    naive = nb.NaiveBayes()
    # Train classifier
    trainer_classifier = naive.naive_bayes_train(
        glass_data_train.iloc[:, 0:9], glass_data_train[target_class])
    # Test classifier
    test_classifier = naive.naive_bayes_test(glass_data_test.iloc[:, 0:9],
                                             trainer_classifier)
    # Get success rate back
    nb_perf = naive.compare_prediction(predict_classier=test_classifier,
                                       data=glass_data_test[target_class])

    # Setup SFS object
    stepwise_forward = sfs.SFS()

    # Setup SFS features to go through
    feature_set_columns = list()
    for feature in glass_data.columns:
        feature_set_columns.append(feature)
    feature_set_columns.remove(target_class)

    # Separate data for taining/testing purposes
    glass_data_train_target_class = glass_data_train[target_class]
    glass_data_train_features = glass_data_train.iloc[:, 0:9]
    glass_data_test_features = glass_data_test.iloc[:, 0:9]

    # Run SFS
    best_features = stepwise_forward.perform_sfs(
        feature_set_columns=feature_set_columns,
        d_train=glass_data_train_features,
        d_valid=glass_data_test_features,
        predicted=glass_data_train_target_class)

    # Return dataset with best features only
    best_data = glass_data[best_features]

    return best_features, best_data, glass_data, nb_perf
Example #2
0
def nb_iris(iris_data, target_class):
    # Split the data set into 2/3 and 1/3
    iris_data_train = iris_data.sample(frac=.667)
    iris_data_test = iris_data.drop(iris_data_train.index)

    # Create Naive Bayes for later comparison (Added in last, so that is why it is not greatly structured in the code)
    naive = nb.NaiveBayes()
    # Train classifier
    trainer_classifier = naive.naive_bayes_train(iris_data_train.iloc[:, 0:4],
                                                 iris_data_train[target_class])
    # Test classifier
    test_classifier = naive.naive_bayes_test(iris_data_test.iloc[:, 0:4],
                                             trainer_classifier)
    # Get success rate back
    nb_perf = naive.compare_prediction(predict_classier=test_classifier,
                                       data=iris_data_test[target_class])

    return nb_perf
Example #3
0
def main():
    # Print all output to file
    # Comment out for printing in console
    sys.stdout = open("./Assignment2Output.txt", "w")

    ##### Iris #####
    iris_target_class = "class"
    sfs_iris, sfs_best_data_iris, iris_data, iris_nb = run_sfs_iris(
        filename="data/iris.data", target_class=iris_target_class)
    print("Best Feature Selection of Iris: ")
    print(sfs_iris)
    print(sfs_best_data_iris)
    print()
    # Run k means with k=2 since my data setup normalizes data to be either 0 or 1 for the class we intend to have
    kmeans_iris = km.KMeans(num_classes=2, data=sfs_best_data_iris)
    iris_centroids, iris_clusters = kmeans_iris.kmeans_alg()

    print("Centroids for Iris: ")
    print(iris_centroids)
    print()
    print("Clusters for Iris: ")
    print(iris_clusters)
    print()

    # Get silhouette coefficient
    silhouette_coefficient_iris = kmeans_iris.silhouette_coefficient(
        iris_clusters)
    print("Silhouette coefficient for Iris: " +
          str(silhouette_coefficient_iris))
    print()

    # Get success rate for comparison
    compare_iris = nb.NaiveBayes()
    success_rate_iris = compare_iris.compare_prediction(
        iris_clusters, iris_data[iris_target_class])
    print("Success rate for Iris: " + str(success_rate_iris) + "%")
    print()
    print("Success rate for Iris Naive Bayes: " + str(iris_nb) + "%")
    print('\n' * 3)

    ##### Glass #####
    glass_target_class = "Type of glass"
    sfs_glass, sfs_best_data_glass, glass_data, glass_nb = run_sfs_glass(
        filename="data/glass.data", target_class=glass_target_class)
    print("Best Feature Selection of Glass: ")
    print(sfs_glass)
    print(sfs_best_data_glass)
    print()
    # Run k means with k=2 since my data setup normalizes data to be either 0 or 1 for the class we intend to have
    kmeans_glass = km.KMeans(num_classes=2, data=sfs_best_data_glass)
    glass_centroids, glass_clusters = kmeans_glass.kmeans_alg()

    print("Centroids for Glass: ")
    print(glass_centroids)
    print()
    print("Clusters for Glass: ")
    print(glass_clusters)
    print()

    # Get silhouette coefficient
    silhouette_coefficient_glass = kmeans_glass.silhouette_coefficient(
        glass_clusters)
    print("Silhouette coefficient for Glass: " +
          str(silhouette_coefficient_glass))
    print()

    # Get success rate for comparison
    compare_glass = nb.NaiveBayes()
    success_rate_glass = compare_glass.compare_prediction(
        glass_clusters, glass_data[glass_target_class])
    print("Success rate for Glass: " + str(success_rate_glass) + "%")
    print()
    print("Success rate for Glass Naive Bayes: " + str(glass_nb) + "%")
    print('\n' * 3)

    ##### Spambase #####
    spambase_target_class = "57"
    sfs_spambase, sfs_best_data_spambase, spambase_data, spambase_nb = run_sfs_spambase(
        filename="data/spambase.data", target_class=spambase_target_class)
    print("Best Feature Selection of Spambase: ")
    print(sfs_spambase)
    print(sfs_best_data_spambase)
    print()
    # Run k means with k=2 since my data setup normalizes data to be either 0 or 1 for the class we intend to have
    kmeans_spambase = km.KMeans(num_classes=2, data=sfs_best_data_spambase)
    spambase_centroids, spambase_clusters = kmeans_spambase.kmeans_alg()

    print("Centroids for Spambase: ")
    print(spambase_centroids)
    print()
    print("Clusters for Spambase: ")
    print(spambase_clusters)
    print()

    # Get silhouette coefficient
    silhouette_coefficient_spambase = kmeans_spambase.silhouette_coefficient(
        spambase_clusters)
    print("Silhouette coefficient for Spambase: " +
          str(silhouette_coefficient_spambase))
    print()

    # Get success rate for comparison
    compare_spambase = nb.NaiveBayes()
    success_rate_spambase = compare_spambase.compare_prediction(
        spambase_clusters, spambase_data[spambase_target_class])
    print("Success rate for Spambase: " + str(success_rate_spambase) + "%")
    print()
    print("Success rate for Spambase Naive Bayes: " + str(spambase_nb) + "%")
    print('\n' * 3)
Example #4
0
 def __init__(self):
     self.nb = nb.NaiveBayes()