def knn_and_naive(table):
    """Run k-NN and Naive Bayes over the table and print accuracy reports.

    Prints random-subsample and stratified-cross-fold accuracy (plus a
    confusion matrix for the stratified runs) for both classifiers, and
    appends the stratified accuracies to the module-level accuracy_values.

    :param table: the data table to classify
    :return: nothing
    """
    salary_idx = constants.INDICES['salary']

    def report(title, acc):
        # Shared two-line accuracy report used by every run below.
        print('\t' + title)
        print('\t\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))

    map_columns_table(table)
    table = knn.normalize_table(table, [5, 7])

    # --- K-Nearest Neighbors ---
    output.printHeader('K-Nearest Neighbors')
    report('Random Subsample',
           classifier_util.accuracy(
               partition_util.random_subsample_knn(table, 5, 10, salary_idx)))

    fold_labels = partition_util.stratified_cross_fold_knn(
        table, 5, 10, salary_idx)
    acc = classifier_util.accuracy(fold_labels)
    accuracy_values.append(acc)
    report('Stratified Cross Folds (5)', acc)
    _printConfusionMatrix(fold_labels, 'Salary')

    # --- Naive Bayes ---
    output.printHeader('Naive Bayes')
    test_by_names = ['degree', 'ethnicity', 'gender']
    report('Random Subsample',
           classifier_util.accuracy(
               partition_util.random_subsample_naive_bayes(
                   table, 10, salary_idx, test_by_names)))

    fold_labels = partition_util.stratified_cross_fold_naive_bayes(
        table, 10, salary_idx, test_by_names)
    acc = classifier_util.accuracy(fold_labels)
    accuracy_values.append(acc)
    report('Stratified CrossFolding', acc)
    _printConfusionMatrix(fold_labels, 'Salary')
def test_KNN(table, k):
    """Run k-NN once at the given k, logging its confusion matrix,
    accuracy, and wall-clock runtime."""
    output.update("... Testing KNN")
    logging.info("KNN report")
    started = time.time()
    knn_labels = run_KNN(table, k)
    confusion_matrix(knn_labels, 'score')
    # Accuracy is computed before the elapsed-time read, as before.
    acc = accuracy(knn_labels)
    logging.info('KNN at k=%s has %s accuracy in %s seconds'
                 % (k, acc, str(time.time() - started)))
def test_bayes(table):
    """Discretize the table, run Naive Bayes, and log timing plus accuracy.

    Columns 1-5 are each discretized into 10 bins before classification.

    :param table: the data table to classify
    :return: nothing (results go to the log and status output)
    """
    logging.info('\n# Testing Naive Bayes')
    discrete_table = discretize_table(table, [(1, 10), (2, 10), (3, 10),
                                              (4, 10), (5, 10)])
    # Fix: announce progress BEFORE the long-running classification.
    # Originally this update was emitted after run_bayes() returned, so the
    # status never showed while Bayes was running (test_KNN updates first).
    output.update("... Running Naive Bayes")
    start = time.time()
    labels = run_bayes(discrete_table)
    logging.info("Time in Seconds:" + str(time.time() - start) + "s")
    confusion_matrix(labels, 'score')
    logging.info("\nAccuracy:" + str(accuracy(labels)))
def randomforest(table, n, m, f):
    """Run a random forest over the table and print its accuracy report.

    :param table: the data table to classify
    :param n: number of trees to generate
    :param m: number of (best) trees kept in the ensemble
    :param f: number of attributes randomly considered at each split
    :return: nothing (prints header, parameters, accuracy, confusion matrix)
    """
    output.printHeader('Random Forest')
    print("N = " + str(n) + " M = " + str(m) + " F = " + str(f))
    indexes = [INDICES['degree'], INDICES['ethnicity'], INDICES['gender']]
    # Fix: dropped the unused `domains = table_utils.get_domains(...)` local.
    # The train/test splits returned by run_a_table are not needed here.
    forest_labels, _train, _test = \
        run_a_table(table, indexes, INDICES['salary'], n, m, f)
    # Fix: renamed misspelled local 'forest_accurcay' -> 'forest_accuracy'.
    forest_accuracy = accuracy(forest_labels)
    print('\tAccuracy = ' + str(forest_accuracy))
    _printConfusionMatrix(forest_labels, 'Salary')
def test_forest(table, maxN):
    """Benchmark random forests for every (N, M, F) combination and log
    the results as one table.

    N ranges over 1..maxN-1, M over 1..N-1 (the kept subset is strictly
    smaller than the ensemble), and F over 1..3.

    :param table: the data table to classify
    :param maxN: exclusive upper bound for the number of trees N
    :return: nothing (the tabulated results are written to the log)
    """
    logging.info('\n# Testing Random Forest')
    rows = [[
        "Attempt Number", "N", "M", "F", "Accuracy", "Time in Seconds"
    ]]
    attempt = 1
    for n in range(1, maxN):
        output.update("... Running Trees for n=%s" % n)
        for m in range(1, n):
            for f in range(1, 4):
                start = time.time()
                labels = run_forest(table, n, m, f)
                rows.append([attempt, n, m, f, accuracy(labels),
                             str(time.time() - start) + 's'])
                attempt += 1
    # Fix: "fancy" is not a valid tabulate format name (tabulate silently
    # fell back to "simple"); "fancy_grid" is the intended box-drawn style.
    logging.info(
        '\n' + str(tabulate(rows, headers="firstrow", tablefmt="fancy_grid")))
def decisiontree(table):
    """Build a TDIDT decision tree on the table, print its rules, then
    evaluate it with 10-fold stratified cross validation on 'salary'.

    :param table: the data table to classify
    :return: nothing (prints rules, accuracy, and a confusion matrix;
        appends the accuracy to the module-level accuracy_values)
    """
    map_columns_table(table)
    output.printHeader('Decision Tree')
    attributes = these(INDICES, 'ethnicity', 'degree', 'gender')
    domains = table_utils.get_domains(table, attributes)
    tree = decision_tree.tdidt(table, attributes, domains, INDICES['salary'])
    # Column names used when rendering the tree's rules for display.
    decision_tree.print_rules(tree, [
        'age', 'job-type', 'degree', 'marital-status', 'ethnicity', 'gender',
        'country', 'salary'
    ], 'salary')
    # NOTE(review): attributes is rebuilt here in a different order
    # ('degree','ethnicity','gender' vs 'ethnicity','degree','gender' above),
    # while `domains` still reflects the first ordering. Confirm that
    # get_domains/classify are insensitive to attribute order.
    attributes = these(INDICES, 'degree', 'ethnicity', 'gender')
    # Creates a myClassifier function that's paritally filled out
    # From decision_tree.classify
    # Essentially a new function:
    # myClassifier(training, test, class_index)
    myClassifier = partial(decision_tree.classify,
                           att_indexes=attributes,
                           att_domains=domains)
    labels = homework.stratified_cross_fold(table, 10, INDICES['salary'],
                                            myClassifier)
    acc = accuracy(labels)
    accuracy_values.append(acc)
    print('\n')
    print('Stratified CrossFolding')
    print('\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    print('\n')
    # Confusion Matrix
    _printConfusionMatrix(labels, 'Salary')
def _accuracy_for_tree(tree, class_index, test_set):
    """Classify test_set with the given tree and return the accuracy."""
    predicted = decision_tree.classify_with_tree(tree, class_index, test_set)
    return classifier_util.accuracy(predicted)