Example #1
0
def randomforest(table, n, m, f):
    """Run a random forest on the income table and report its accuracy.

    Prints a section header, the forest parameters, the overall accuracy,
    and a confusion matrix for the 'salary' class label.

    :param table: the (already cleaned) income dataset table
    :param n: number of trees generated (assumed; TODO confirm against run_a_table)
    :param m: number of best trees kept (assumed; TODO confirm)
    :param f: size of the random attribute subset per split (assumed; TODO confirm)
    :return: nothing
    """
    output.printHeader('Random Forest')
    print("N = " + str(n) + " M = " + str(m) + " F = " + str(f))
    indexes = [INDICES['degree'], INDICES['ethnicity'], INDICES['gender']]
    # NOTE: dropped an unused table_utils.get_domains(...) computation here.
    forest_labels, train, test = \
        run_a_table(table, indexes,
                    INDICES['salary'], n, m, f)
    # Typo fix: local was previously misspelled 'forest_accurcay'.
    forest_accuracy = accuracy(forest_labels)

    print('\tAccuracy = ' + str(forest_accuracy))
    _printConfusionMatrix(forest_labels, 'Salary')
Example #2
0
def knn_and_naive(table):
    """Evaluate KNN and Naive Bayes classifiers on *table*.

    Runs random subsampling and stratified cross-fold evaluations for
    each classifier, printing accuracy/error-rate reports and confusion
    matrices for the 'salary' class label.

    :param table: the dataset table (docstring previously said titanic;
        main() feeds it the cleaned income table — verify which applies)
    :return: nothing
    """
    def _report(title, acc):
        # Shared accuracy / error-rate report used by every run below.
        print('\t' + title)
        print('\t\tAccuracy = ' + str(acc) + ', error rate = ' +
              str(1 - acc))

    map_columns_table(table)
    table = knn.normalize_table(table, [5, 7])

    class_idx = constants.INDICES['salary']

    # --- K-Nearest Neighbors ---
    output.printHeader('K-Nearest Neighbors')

    subsample_labels = partition_util.random_subsample_knn(
        table, 5, 10, class_idx)
    _report('Random Subsample', classifier_util.accuracy(subsample_labels))

    fold_labels = partition_util.stratified_cross_fold_knn(
        table, 5, 10, class_idx)
    acc = classifier_util.accuracy(fold_labels)
    accuracy_values.append(acc)
    _report('Stratified Cross Folds (5)', acc)

    _printConfusionMatrix(fold_labels, 'Salary')

    # --- Naive Bayes ---
    output.printHeader('Naive Bayes')
    attr_names = ['degree', 'ethnicity', 'gender']

    nb_subsample = partition_util.random_subsample_naive_bayes(
        table, 10, class_idx, attr_names)
    _report('Random Subsample', classifier_util.accuracy(nb_subsample))

    nb_fold_labels = partition_util.stratified_cross_fold_naive_bayes(
        table, 10, class_idx, attr_names)
    acc = classifier_util.accuracy(nb_fold_labels)
    accuracy_values.append(acc)
    _report('Stratified CrossFolding', acc)
    _printConfusionMatrix(nb_fold_labels, 'Salary')
Example #3
0
def main():
    """Entry point: clean the income dataset, visualize it, then run the
    KNN/Naive-Bayes, decision-tree, and random-forest classifiers.
    """
    # Data preprocessing: drop rows containing NA values and persist the
    # cleaned copy that every classifier stage below reads back.
    newTable = file_system.loadTable("../datasets/income.csv")
    removedRowsTable = clean.removeNA(newTable)
    # Fix: the write() return value was previously bound to an unused local.
    file_system.write(removedRowsTable, "../datasets/incomeNoNA.csv")
    output.printHeader('Rows with NAs have been removed.')

    # Data visualization
    data_vis()
    output.printHeader('Data visualization complete.')

    # Each classifier reloads a fresh copy of the cleaned table —
    # presumably because the helpers mutate it in place (map_columns_table,
    # knn.normalize_table); verify before consolidating the loads.

    # KNN and Naive Bayes classifiers
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    knn_and_naive(table)

    # Decision Tree classifier
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    decisiontree(table)

    # Random Forest classifier
    table = file_system.loadTable('../datasets/incomeNoNA.csv')
    randomforest(table, 3000, 215, 2)  # N, M, and F vals
Example #4
0
def decisiontree(table):
    """Train a TDIDT decision tree on *table* and evaluate it.

    Prints the learned rules, the stratified cross-fold accuracy, and a
    confusion matrix for the 'salary' class label.

    :param table: the (already cleaned) income dataset table
    :return: nothing
    """
    map_columns_table(table)

    output.printHeader('Decision Tree')

    attributes = these(INDICES, 'ethnicity', 'degree', 'gender')
    domains = table_utils.get_domains(table, attributes)
    tree = decision_tree.tdidt(table, attributes, domains, INDICES['salary'])

    decision_tree.print_rules(tree, [
        'age', 'job-type', 'degree', 'marital-status', 'ethnicity', 'gender',
        'country', 'salary'
    ], 'salary')

    attributes = these(INDICES, 'degree', 'ethnicity', 'gender')
    # Fix: recompute the domains so they line up with the reordered
    # attribute list. Previously the domains from the original ordering
    # (ethnicity, degree, gender) were reused, which mismatches the new
    # ordering if get_domains returns a positionally-ordered result.
    domains = table_utils.get_domains(table, attributes)

    # Partially apply decision_tree.classify so it matches the
    # classifier(training, test, class_index) shape expected by
    # homework.stratified_cross_fold.
    myClassifier = partial(decision_tree.classify,
                           att_indexes=attributes,
                           att_domains=domains)

    labels = homework.stratified_cross_fold(table, 10, INDICES['salary'],
                                            myClassifier)

    acc = accuracy(labels)
    accuracy_values.append(acc)
    print('\n')
    print('Stratified CrossFolding')
    print('\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    print('\n')

    # Confusion Matrix
    _printConfusionMatrix(labels, 'Salary')
Example #5
0
def _printConfusionMatrix(labels, name):
    """Print a titled confusion-matrix section for the given labels.

    :param labels: classifier label pairs to summarize
    :param name: display name of the class attribute
    """
    section_title = 'Confusion Matrix'
    output.printHeader(section_title)
    partition_util.print_confusion_matrix(labels, name)