def randomforest(table, n, m, f):
    """Run the random forest classifier on the income table and report results.

    Prints a header, the ensemble parameters, the overall accuracy, and a
    confusion matrix for the 'Salary' class.

    :param table: the income dataset table (rows of attribute values)
    :param n: N — presumably the number of candidate trees built; confirm
        against run_a_table
    :param m: M — presumably the number of best trees kept in the forest
    :param f: F — presumably the size of the random attribute subset per split
    :return: nothing (output is printed)
    """
    output.printHeader('Random Forest')
    print("N = " + str(n) + " M = " + str(m) + " F = " + str(f))
    # Classify salary from degree, ethnicity, and gender.
    indexes = [INDICES['degree'], INDICES['ethnicity'], INDICES['gender']]
    # Fix: the original also computed table_utils.get_domains(table, indexes)
    # into a local that was never used; that dead call has been removed.
    # The train/test partitions are returned but not needed here.
    forest_labels, _train, _test = \
        run_a_table(table, indexes, INDICES['salary'], n, m, f)
    # Fix: local was misspelled 'forest_accurcay' in the original.
    forest_accuracy = accuracy(forest_labels)
    print('\tAccuracy = ' + str(forest_accuracy))
    _printConfusionMatrix(forest_labels, 'Salary')
def knn_and_naive(table):
    """Evaluate KNN and Naive Bayes classifiers on the income table.

    For each classifier, runs both a random-subsample and a stratified
    cross-fold evaluation, printing accuracy/error-rate figures and a
    confusion matrix for the 'Salary' class. Stratified-fold accuracies
    are also appended to the module-level accuracy_values list.

    :param table: the income dataset table
    :return: nothing (output is printed)
    """
    map_columns_table(table)
    table = knn.normalize_table(table, [5, 7])

    # ---- K-Nearest Neighbors ----
    output.printHeader('K-Nearest Neighbors')

    labels = partition_util.random_subsample_knn(
        table, 5, 10, constants.INDICES['salary'])
    acc = classifier_util.accuracy(labels)
    print('\tRandom Subsample')
    print('\t\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))

    labels = partition_util.stratified_cross_fold_knn(
        table, 5, 10, constants.INDICES['salary'])
    acc = classifier_util.accuracy(labels)
    accuracy_values.append(acc)
    print('\tStratified Cross Folds (5)')
    print('\t\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    _printConfusionMatrix(labels, 'Salary')

    # ---- Naive Bayes ----
    output.printHeader('Naive Bayes')
    test_by_names = ['degree', 'ethnicity', 'gender']

    acc = classifier_util.accuracy(
        partition_util.random_subsample_naive_bayes(
            table, 10, constants.INDICES['salary'], test_by_names))
    print('\tRandom Subsample')
    print('\t\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))

    labels = partition_util.stratified_cross_fold_naive_bayes(
        table, 10, constants.INDICES['salary'], test_by_names)
    acc = classifier_util.accuracy(labels)
    accuracy_values.append(acc)
    print('\tStratified CrossFolding')
    print('\t\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    _printConfusionMatrix(labels, 'Salary')
def main():
    """Entry point: clean the income dataset, then run each classifier.

    Pipeline: load raw CSV -> drop rows with NAs -> write the cleaned CSV ->
    visualize -> evaluate KNN/Naive Bayes, decision tree, and random forest.
    """
    # Data preprocessing: remove NA rows and persist the cleaned copy so the
    # classifier stages below can each reload it from disk.
    newTable = file_system.loadTable("../datasets/income.csv")
    removedRowsTable = clean.removeNA(newTable)
    # Fix: the original bound this call's result to an unused local
    # (incomeDataFullNoNA); only the write side effect is needed.
    file_system.write(removedRowsTable, "../datasets/incomeNoNA.csv")
    output.printHeader('Rows with NAs have been removed.')

    # Data visualization
    data_vis()
    output.printHeader('Data visualization complete.')

    # KNN and Naive Bayes classifiers
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    knn_and_naive(table)

    # Decision Tree classifier
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    decisiontree(table)

    # Random Forest classifier
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    randomforest(table, 3000, 215, 2)  # N, M, and F vals
def decisiontree(table):
    """Build a decision tree on the income table, print its rules, and
    evaluate it with 10-fold stratified cross-validation.

    :param table: the income dataset table
    :return: nothing (output is printed); appends the fold accuracy to the
        module-level accuracy_values list
    """
    map_columns_table(table)
    output.printHeader('Decision Tree')
    # Predict salary from ethnicity, degree, and gender.
    attributes = these(INDICES, 'ethnicity', 'degree', 'gender')
    domains = table_utils.get_domains(table, attributes)
    tree = decision_tree.tdidt(table, attributes, domains, INDICES['salary'])
    # Column-name list used to render the rules in readable form.
    decision_tree.print_rules(tree, [
        'age', 'job-type', 'degree', 'marital-status', 'ethnicity', 'gender',
        'country', 'salary'
    ], 'salary')
    # NOTE(review): same three attributes as above but in a different order,
    # while `domains` still reflects the first ordering. If get_domains pairs
    # domains positionally with attributes, this mismatch may be a latent
    # bug — confirm against decision_tree.classify before changing.
    attributes = these(INDICES, 'degree', 'ethnicity', 'gender')
    # Creates a myClassifier function that's paritally filled out
    # From decision_tree.classify
    # Essentially a new function:
    #     myClassifier(training, test, class_index)
    myClassifier = partial(decision_tree.classify,
                           att_indexes=attributes,
                           att_domains=domains)
    labels = homework.stratified_cross_fold(table, 10, INDICES['salary'],
                                            myClassifier)
    acc = accuracy(labels)
    accuracy_values.append(acc)
    print('\n')
    print('Stratified CrossFolding')
    print('\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    print('\n')
    # Confusion Matrix
    _printConfusionMatrix(labels, 'Salary')
def _printConfusionMatrix(labels, name):
    """Print a 'Confusion Matrix' header followed by the matrix itself.

    :param labels: classifier label pairs to tabulate
    :param name: display name for the class (e.g. 'Salary')
    :return: nothing (output is printed)
    """
    output.printHeader('Confusion Matrix')
    partition_util.print_confusion_matrix(labels, name)