Example #1
def knn_and_naive(table):
    """ Analyzes the table based on Knn and Naive Bayes

    :param table: the table of the titanic dataset
    :return: nothing
    """
    map_columns_table(table)
    # Normalize the continuous columns (5 and 7) so no single attribute
    # dominates the kNN distance calculation.
    table = knn.normalize_table(table, [5, 7])

    # KNN
    output.printHeader('K-Nearest Neighbors')

    labels = partition_util.random_subsample_knn(table, 5, 10,
                                                 constants.INDICES['salary'])
    accuracy = classifier_util.accuracy(labels)
    print('\tRandom Subsample')
    print('\t\tAccuracy = ' + str(accuracy) + ', error rate = ' +
          str(1 - accuracy))

    labels = partition_util.stratified_cross_fold_knn(
        table, 5, 10, constants.INDICES['salary'])

    accuracy = classifier_util.accuracy(labels)
    accuracy_values.append(accuracy)
    print('\tStratified Cross Folds (5)')
    print('\t\tAccuracy = ' + str(accuracy) + ', error rate = ' +
          str(1 - accuracy))

    _printConfusionMatrix(labels, 'Salary')

    # Naive Bayes
    output.printHeader('Naive Bayes')
    test_by_names = ['degree', 'ethnicity', 'gender']

    accuracy = classifier_util.accuracy(
        partition_util.random_subsample_naive_bayes(
            table, 10, constants.INDICES['salary'], test_by_names))

    print('\tRandom Subsample')
    print('\t\tAccuracy = ' + str(accuracy) + ', error rate = ' +
          str(1 - accuracy))

    labels = partition_util.stratified_cross_fold_naive_bayes(
        table, 10, constants.INDICES['salary'], test_by_names)
    accuracy = classifier_util.accuracy(labels)
    accuracy_values.append(accuracy)
    print('\tStratified Cross Folds')
    print('\t\tAccuracy = ' + str(accuracy) + ', error rate = ' +
          str(1 - accuracy))
    _printConfusionMatrix(labels, 'Salary')
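The accuracy and error-rate prints above assume classifier_util.accuracy returns the fraction of correct predictions in [0, 1]. Its implementation isn't shown here; the sketch below assumes labels is a list of (predicted, actual) pairs, which is a guess based on how the partition helpers are used, not something the source confirms.

def accuracy(labels):
    # Count the pairs whose predicted label matches the actual label.
    correct = sum(1 for predicted, actual in labels if predicted == actual)
    return correct / len(labels)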
Example #2
def test_KNN(table, k):

    # Test KNN for a single value of k; call this once per k to compare
    # several variations of k.
    output.update("... Testing KNN")
    logging.info("KNN report")
    start = time.time()
    labels = run_KNN(table, k)
    confusion_matrix(labels, 'score')
    logging.info('KNN at k=%s has %s accuracy in %s seconds' %
                 (k, accuracy(labels), time.time() - start))
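Since test_KNN handles one k per call, comparing several neighborhood sizes takes a small driver loop like the hypothetical one below; the particular k values are illustrative only.

for k in (1, 3, 5, 10):
    test_KNN(table, k)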
Example #3
def test_bayes(table):
    logging.info('\n# Testing Naive Bayes')

    discrete_table = discretize_table(table, [(1, 10), (2, 10), (3, 10),
                                              (4, 10), (5, 10)])

    output.update("... Running Naive Bayes")
    start = time.time()
    labels = run_bayes(discrete_table)
    logging.info("Time in Seconds: " + str(time.time() - start))
    confusion_matrix(labels, 'score')
    logging.info("\nAccuracy:" + str(accuracy(labels)))
Example #4
def randomforest(table, n, m, f):
    output.printHeader('Random Forest')
    print("N = " + str(n) + " M = " + str(m) + " F = " + str(f))
    indexes = [INDICES['degree'], INDICES['ethnicity'], INDICES['gender']]
    domains = table_utils.get_domains(table, indexes)
    forest_labels, train, test = run_a_table(table, indexes,
                                             INDICES['salary'], n, m, f)
    forest_accuracy = accuracy(forest_labels)

    print('\tAccuracy = ' + str(forest_accuracy))
    _printConfusionMatrix(forest_labels, 'Salary')
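The n, m, f parameters appear to follow the ensemble-selection variant of random forests: grow N trees on bootstrap samples, consider F randomly chosen attributes at each split, and keep the M most accurate trees on a validation set. run_a_table's internals aren't shown, so the outline below is hypothetical; decision_tree.tdidt_random and the validation split are assumed names (it reuses _accuracy_for_tree from Example #7).

import random

def build_forest(train, validation, attributes, class_index, n, m, f):
    scored = []
    for _ in range(n):
        # Bootstrap: sample the training rows with replacement.
        sample = [random.choice(train) for _ in range(len(train))]
        # Hypothetical TDIDT variant that considers only F randomly chosen
        # attributes at each split (not part of the shown source).
        tree = decision_tree.tdidt_random(sample, attributes, class_index, f)
        scored.append((_accuracy_for_tree(tree, class_index, validation),
                       tree))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    # Keep only the M most accurate trees.
    return [tree for _, tree in scored[:m]]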
Example #5
def test_forest(table, maxN):
    logging.info('\n# Testing Random Forest')

    toTabulate = [[
        "Attempt Number", "N", "M", "F", "Accuracy", "Time in Seconds"
    ]]
    attempt = 1

    for n in range(1, maxN):
        output.update("... Running Trees for n=%s" % n)
        for m in range(1, n):
            for f in range(1, 4):
                start = time.time()
                labels = run_forest(table, n, m, f)
                toTabulate.append([attempt, n, m, f, accuracy(labels),
                                   str(time.time() - start) + 's'])
                attempt += 1

    logging.info('\n' + tabulate(toTabulate, headers="firstrow",
                                 tablefmt="fancy_grid"))
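tabulate here is the python-tabulate package: headers="firstrow" takes the header row from the data itself, and tablefmt="fancy_grid" draws a box-drawing grid ("fancy" is not a registered format name, so it would silently fall back to plain output). A standalone example with illustrative numbers:

from tabulate import tabulate

rows = [["Attempt Number", "N", "M", "F", "Accuracy", "Time in Seconds"],
        [1, 2, 1, 1, 0.81, "0.42s"]]
print(tabulate(rows, headers="firstrow", tablefmt="fancy_grid"))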
Example #6
def decisiontree(table):
    map_columns_table(table)

    output.printHeader('Decision Tree')

    attributes = these(INDICES, 'ethnicity', 'degree', 'gender')
    domains = table_utils.get_domains(table, attributes)
    tree = decision_tree.tdidt(table, attributes, domains, INDICES['salary'])

    decision_tree.print_rules(tree, [
        'age', 'job-type', 'degree', 'marital-status', 'ethnicity', 'gender',
        'country', 'salary'
    ], 'salary')

    attributes = these(INDICES, 'degree', 'ethnicity', 'gender')
    # Recompute the domains so they line up with the new attribute order.
    domains = table_utils.get_domains(table, attributes)

    # Creates a myClassifier function that's partially filled out
    # from decision_tree.classify.
    # Essentially a new function:
    #     myClassifier(training, test, class_index)
    myClassifier = partial(decision_tree.classify,
                           att_indexes=attributes,
                           att_domains=domains)

    labels = homework.stratified_cross_fold(table, 10, INDICES['salary'],
                                            myClassifier)

    acc = accuracy(labels)
    accuracy_values.append(acc)
    print('\n')
    print('Stratified Cross Folds')
    print('\tAccuracy = ' + str(acc) + ', error rate = ' + str(1 - acc))
    print('\n')

    # Confusion Matrix
    _printConfusionMatrix(labels, 'Salary')
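functools.partial, used above to build myClassifier, returns a new callable with some arguments pre-filled. decision_tree.classify's real signature isn't shown here; the toy stand-in below only demonstrates how the keyword arguments get baked in.

from functools import partial

def classify(training, test, class_index, att_indexes, att_domains):
    # Toy stand-in for decision_tree.classify.
    return [(att_indexes, att_domains, class_index)]

my_classifier = partial(classify, att_indexes=[1, 2], att_domains={1: [0, 1]})

# Equivalent to classify([], [], 0, att_indexes=[1, 2],
# att_domains={1: [0, 1]}).
print(my_classifier([], [], 0))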
Example #7
def _accuracy_for_tree(tree, class_index, test_set):
    """Classifies test_set with the given tree and returns the accuracy."""
    labels = decision_tree.classify_with_tree(tree, class_index, test_set)
    return classifier_util.accuracy(labels)
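A hypothetical use of this helper is picking the single best candidate tree by held-out accuracy, the selection step a forest like Example #4's would need; candidate_trees, class_index, and validation_set are assumed names.

best_tree = max(candidate_trees,
                key=lambda tree: _accuracy_for_tree(tree, class_index,
                                                    validation_set))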