def compare_naive_bayes(training, testing, pca_training, pca_testing):
    """Print Naive Bayes accuracy for the plain and the PCA data sets.

    For each (training, testing) pair a ``NaiveBayes`` classifier is built
    from the training set, run over the testing set with ``classify_all``,
    and the value returned by ``compute_accuracy`` is printed as-is.

    Args:
        training: data set passed to ``NaiveBayes`` for the "Without PCA" run.
        testing: data set classified in the "Without PCA" run.
        pca_training: data set passed to ``NaiveBayes`` for the "With PCA" run.
        pca_testing: data set classified in the "With PCA" run.
    """
    util.print_line_break()
    print("Comparing Naive Bayes accuracy with and without PCA")

    # Same procedure for both variants; only the heading and data differ.
    runs = (
        ("Without PCA:", training, testing),
        ("With PCA", pca_training, pca_testing),
    )
    for heading, train_set, test_set in runs:
        print(heading)
        results = NaiveBayes(train_set).classify_all(test_set)
        print(results.compute_accuracy())
def compare_knn(training, testing, pca_training, pca_testing):
    """Print KNN (k=3) accuracy for the plain and the PCA data sets.

    For each (training, testing) pair a ``Knn`` classifier with ``k=3`` is
    built from the training set, run over the testing set with
    ``classify_all``, and the value returned by ``compute_accuracy`` is
    printed as-is.

    Args:
        training: data set passed to ``Knn`` for the "Without PCA" run.
        testing: data set classified in the "Without PCA" run.
        pca_training: data set passed to ``Knn`` for the "With PCA" run.
        pca_testing: data set classified in the "With PCA" run.
    """
    util.print_line_break()
    print("Comparing KNN accuracy with and without PCA")

    # Same procedure for both variants; only the heading and data differ.
    runs = (
        ("Without PCA:", training, testing),
        ("With PCA", pca_training, pca_testing),
    )
    for heading, train_set, test_set in runs:
        print(heading)
        results = Knn(train_set, k=3).classify_all(test_set)
        print(results.compute_accuracy())
def pca_find_important_features(dataset):
    """Print the first-principal-component impacts for ``dataset``.

    Runs ``pca(dataset, 2)`` and prints whatever its
    ``get_first_component_impacts`` method returns.

    Args:
        dataset: data set handed to ``pca``.
    """
    # Rationale: the weight matrix that transforms the original data into
    # the reduced data shows which features matter most. Features whose
    # weight has the largest magnitude have the largest impact on the
    # reduced data.
    util.print_line_break()
    print("First principal component impacts (absolute value of weight):")
    reduced = pca(dataset, 2)
    print(reduced.get_first_component_impacts())
def decision_tree_accuracy_tests(training, testing):
    """Bin both data sets, then print decision tree accuracy as a percentage.

    NOTE(review): the return value of ``bin`` is discarded and the same
    objects are used afterwards, so ``bin`` presumably modifies the data
    sets in place -- confirm, and pass copies if the originals are needed.

    Args:
        training: data set that is binned and then passed to ``DecisionTree``.
        testing: data set that is binned and then classified.
    """
    util.print_line_break()
    print("Decision tree accuracy test:")

    # Bin grades of 0-3 as low, 4-6 as mid, 7-9 as high.
    for dataset in (training, testing):
        dataset.bin("*", [4, 7], bin_names=["low", "mid", "high"])

    tree = DecisionTree(training)
    accuracy = tree.classify_all(testing).compute_accuracy()
    print("%2.5f %%" % (100 * accuracy))
def knn_accuracy_tests(training, testing):
    """Print a table of KNN accuracy (as a percentage) for k = 3..9.

    Args:
        training: data set passed to ``Knn`` for every value of k.
        testing: data set classified for every value of k.
    """
    util.print_line_break()
    for header_line in ("KNN accuracy test:", "k\tAccuracy", "-\t--------"):
        print(header_line)

    # One table row per k; range(3, 10) covers 3 through 9 inclusive.
    for neighbours in range(3, 10):
        classifier = Knn(training, k=neighbours)
        score = classifier.classify_all(testing).compute_accuracy()
        print("%d\t%2.5f %%" % (neighbours, 100 * score))
def main():
    """Run the three clustering experiments, then their ``*_with_pca`` variants.

    Loads the data set through ``util.load_data``, fills missing values, and
    hands a fresh ``data.copy()`` to every experiment function.
    """
    # The original data set.
    data = util.load_data()
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()

    # Each experiment receives its own copy of the prepared data.
    for run_experiment in (cluster_3_groups,
                           cluster_pass_fail,
                           cluster_success_struggle):
        run_experiment(data.copy())

    util.print_line_break()
    print("Now with PCA:")

    for run_experiment in (cluster_3_groups_with_pca,
                           cluster_pass_fail_with_pca,
                           cluster_success_struggle_with_pca):
        run_experiment(data.copy())
def main():
    """Print KNN accuracy without PCA, then for a sweep of component counts.

    The sweep runs ``pca(data, n)`` for every ``n`` in
    ``range(1, data.num_features())`` and prints one tab-separated row per
    ``n`` with the value returned by ``get_knn_accuracy``.
    """
    # The original data set.
    data = util.load_data()
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()

    # Count successful and probation students as one group (s).
    # Comment this out to try and distinguish all 3 groups (s, p, f).
    data.combine_labels(["s", "p"], "s")

    util.print_line_break()
    baseline = get_knn_accuracy(data)
    print("Without PCA: %.5f" % baseline)

    util.print_line_break()
    print("With PCA:")
    print("\t".join(["PCs", "Accuracy"]))

    feature_count = data.num_features()
    for component_count in range(1, feature_count):
        reduced = pca(data, component_count)
        score = get_knn_accuracy(reduced)
        print("%d\t%.5f" % (component_count, score))
def naive_bayes_accuracy_tests(training, testing):
    """Print Naive Bayes accuracy on ``testing`` as a percentage.

    Args:
        training: data set passed to ``NaiveBayes``.
        testing: data set classified with ``classify_all``.
    """
    util.print_line_break()
    print("Naive Bayes accuracy test:")

    classifier = NaiveBayes(training)
    results = classifier.classify_all(testing)
    score = results.compute_accuracy()
    print("%2.5f %%" % (100 * score))