# --- Example 1 ---
def test2():
    """
    Test whether the naive Bayes classifier can correctly classify data
    that it has not seen before.

    This test will fail if our algorithm for smoothing tokens that we have
    not seen yet is broken. Since 'C' is the trigger for the start of a
    statement the probability of 'C unknown' should be very high.

    Returns a tuple giving the number of tests passed, the number of
    tests failed and a string holding all log messages, i.e.
        (3, 4, 'Failed Test[1]: Divide by 0.').
    """

    test = 2
    # Training sentences; <s>...</s> marks the span that should be labeled.
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H <s>C E F D</s> A B I J''',
        '''I J <s>C F D E</s> G H A B''',
        '''a b <s>C D E f</s> I J G H''',
        ]
    # Token sequences that contain tokens never seen during training.
    testdata = [
        ['r', 'x', 'x', 'x', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['r', 'x', 'x', 'y', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'D', 'z', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'w', 'I', 'J', 'G', 'H'],
        ]
    # Expected BIO labels, one list per test sequence.
    correct_output = [
            ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i in range(n):
        try:
            out = library.label_nb(model['P(F|C)'], testdata[i])
            if len(out) != len(correct_output[i]):
                # BUG FIX: this line used the undefined name 'tet', which
                # raised a NameError whenever the length check failed.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was the incorrect length.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                        str(correct_output[i]))
                continue
            if out != correct_output[i]:
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was incorrect.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                        str(correct_output[i]))
                continue
        except Exception:
            # 'except Exception, e' is Python-2-only syntax and bound an
            # unused name; the bare form works on Python 2.6+ and 3.x.
            # Exception details are taken from sys.exc_info() below.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            p = '\t'.join(traceback.format_exception(exceptionType,
                exceptionValue, exceptionTraceback))
            log += "\t%s\n" % p
            continue
        passed += 1
    # BUG FIX: the docstring promises a (passed, failed, log) tuple but the
    # original fell off the end and returned None.
    return (passed, n - passed, log)
# --- Example 2 ---
def test1():
    """
    Test whether the naive Bayes classifier can correctly classify data
    that it has already seen.

    Please note that naive Bayes is a linear classifier so there may be data
    that it will not be able to correctly classify even though it has been
    trained on the same data.

    Returns a tuple giving the number of tests passed, the number of
    tests failed and a string holding all log messages, i.e.
        (3, 4, 'Failed Test [1]: Divide by 0.').
    """

    test = 1
    # Training sentences; <s>...</s> marks the span that should be labeled.
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H C E F D A B I J''',
        '''I J C F D E G H A B''',
        '''a b <s>C D E f</s> I J G H''',
        ]
    # The same token sequences the classifier was trained on.
    testdata = [
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['G', 'H', 'C', 'E', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'F', 'D', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'f', 'I', 'J', 'G', 'H'],
        ]
    # Expected BIO labels, one list per test sequence.
    correct_output = [
            ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
            ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i in range(n):
        try:
            out = library.label_nb(model['P(F|C)'], testdata[i])
            if len(out) != len(correct_output[i]):
                # BUG FIX: this line used the undefined name 'tet', which
                # raised a NameError whenever the length check failed.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was the incorrect length.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                        str(correct_output[i]))
                continue
            if out != correct_output[i]:
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was incorrect.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                        str(correct_output[i]))
                continue
        except Exception:
            # 'except Exception, e' is Python-2-only syntax and bound an
            # unused name; the bare form works on Python 2.6+ and 3.x.
            # Exception details are taken from sys.exc_info() below.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            p = '\t'.join(traceback.format_exception(exceptionType,
                exceptionValue, exceptionTraceback))
            log += "\t%s\n" % p
            continue
        passed += 1
    # BUG FIX: the docstring promises a (passed, failed, log) tuple but the
    # original fell off the end and returned None.
    return (passed, n - passed, log)
# --- Example 3 ---
def crossvalidation(data_file, folds=10):
    """
    Perform k-fold cross validation on a set of data and return the results.

    This is a randomized algorithm (the data is shuffled), so you should
    NOT use it for regression testing.

    data_file should be a file with each line containing a triple quoted
    canonical string representation, i.e. call repr on the string.
    The text to be hi-lighted should be wrapped in <s>...</s> tags.

    Returns a dictionary holding statistics about the performance of the
    classifier: {'raw accuracy': float, 'matrix': confusion matrix keyed
    by true label, then predicted label, row-normalized}.

    Raises Exception when folds <= 1 or folds exceeds the data size.
    """

    if folds <= 1:
        raise Exception("Number of folds is too small. A value greater than 1 is required.")

    # SECURITY NOTE: eval() executes arbitrary code from the data file —
    # only run this on trusted input (ast.literal_eval is the safe parse).
    # BUG FIX: use a context manager so the file handle is closed.
    with open(data_file) as fh:
        training_data = [eval(line) for line in fh.readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception("Number of folds is greater than number of data points.")

    # shuffle the training data so we dont have any funky correlation issues
    # with its ordering.
    random.shuffle(training_data)

    parsed_data = [library.parsetext(text) for text in training_data]
    # First element of each parsed pair is the token itself.
    tokens = [[parsed_data[i][j][0] for j in range(len(parsed_data[i]))] for i in range(N)]
    bio_data = [library.tokens_to_BIO(tokens[i]) for i in range(N)]

    # BUG FIX: the old partition used a rounded per-fold size, which could
    # drop trailing items or create an empty fold (a ZeroDivisionError at
    # the accuracy update below, e.g. N=15, folds=10). Integer boundaries
    # cover every index exactly once and every fold is non-empty for N >= folds.
    fold_index = []
    for i in range(folds):
        fold_index.append(range(i * N // folds, (i + 1) * N // folds))

    accuracy = 0.0
    classes = ['B', 'I', 'O']
    # Confusion matrix: matrix[true_label][predicted_label].
    matrix = {'B':{'B':0.0, 'I':0.0, 'O':0.0}, 'I':{'B':0.0, 'I':0.0, 'O':0.0}, 'O':{'B':0.0, 'I':0.0, 'O':0.0}}

    for i in range(folds):
        print("Fold %d." % i)
        testing = fold_index[i]
        training = list(set(testing).symmetric_difference(set(range(N))))

        testing_data = [bio_data[d] for d in testing]
        training_data = [bio_data[d] for d in training]

        PFC = library.tuned_model(training_data)

        passed = 0
        for test in testing_data:
            tokens = test[0]
            labels = test[1]

            out = library.label_nb(PFC, tokens)

            # A sequence only counts as passed when every label matches.
            if out == labels:
                passed += 1

            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0

        accuracy += passed/float(len(testing))

    raw_accuracy = accuracy/float(folds)

    # Row-normalize the confusion matrix into per-true-label proportions.
    # BUG FIX: guard against a label that never occurred (s == 0 divided by
    # zero before); such a row is simply left as all zeros.
    for c1 in classes:
        s = sum([matrix[c1][c2] for c2 in classes])
        if s:
            for c2 in matrix[c1]:
                matrix[c1][c2] = matrix[c1][c2]/s

    print("Raw Accuracy: %1.2f" % raw_accuracy)
    print("------------------------")
    print("   |   B  |   I  |   O  ")
    print("------------------------")
    print(" B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'], matrix['B']['O']))
    print("------------------------")
    print(" I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'], matrix['I']['O']))
    print("------------------------")
    print(" O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'], matrix['O']['O']))
    print("------------------------")

    return {'raw accuracy':raw_accuracy, 'matrix':matrix}
# --- Example 4 ---
def test2():
    """
    Test whether the naive Bayes classifier can correctly classify data
    that it has not seen before.

    This test will fail if our algorithm for smoothing tokens that we have
    not seen yet is broken. Since 'C' is the trigger for the start of a
    statement the probability of 'C unknown' should be very high.

    Returns a tuple giving the number of tests passed, the number of
    tests failed and a string holding all log messages, i.e.
        (3, 4, 'Failed Test[1]: Divide by 0.').
    """

    test = 2
    # Training sentences; <s>...</s> marks the span that should be labeled.
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H <s>C E F D</s> A B I J''',
        '''I J <s>C F D E</s> G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    # Token sequences that contain tokens never seen during training.
    testdata = [
        ['r', 'x', 'x', 'x', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['r', 'x', 'x', 'y', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'D', 'z', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'w', 'I', 'J', 'G', 'H'],
    ]
    # Expected BIO labels, one list per test sequence.
    correct_output = [
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i in range(n):
        try:
            out = library.label_nb(model['P(F|C)'], testdata[i])
            if len(out) != len(correct_output[i]):
                # BUG FIX: this line used the undefined name 'tet', which
                # raised a NameError whenever the length check failed.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was the incorrect length.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(correct_output[i]))
                continue
            if out != correct_output[i]:
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was incorrect.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(correct_output[i]))
                continue
        except Exception:
            # 'except Exception, e' is Python-2-only syntax and bound an
            # unused name; the bare form works on Python 2.6+ and 3.x.
            # Exception details are taken from sys.exc_info() below.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            p = '\t'.join(
                traceback.format_exception(exceptionType, exceptionValue,
                                           exceptionTraceback))
            log += "\t%s\n" % p
            continue
        passed += 1
    # BUG FIX: the docstring promises a (passed, failed, log) tuple but the
    # original fell off the end and returned None.
    return (passed, n - passed, log)
# --- Example 5 ---
def test1():
    """
    Test whether the naive Bayes classifier can correctly classify data
    that it has already seen.

    Please note that naive Bayes is a linear classifier so there may be data
    that it will not be able to correctly classify even though it has been
    trained on the same data.

    Returns a tuple giving the number of tests passed, the number of
    tests failed and a string holding all log messages, i.e.
        (3, 4, 'Failed Test [1]: Divide by 0.').
    """

    test = 1
    # Training sentences; <s>...</s> marks the span that should be labeled.
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H C E F D A B I J''',
        '''I J C F D E G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    # The same token sequences the classifier was trained on.
    testdata = [
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['G', 'H', 'C', 'E', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'F', 'D', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'f', 'I', 'J', 'G', 'H'],
    ]
    # Expected BIO labels, one list per test sequence.
    correct_output = [
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i in range(n):
        try:
            out = library.label_nb(model['P(F|C)'], testdata[i])
            if len(out) != len(correct_output[i]):
                # BUG FIX: this line used the undefined name 'tet', which
                # raised a NameError whenever the length check failed.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was the incorrect length.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(correct_output[i]))
                continue
            if out != correct_output[i]:
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() was incorrect.\n"
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(correct_output[i]))
                continue
        except Exception:
            # 'except Exception, e' is Python-2-only syntax and bound an
            # unused name; the bare form works on Python 2.6+ and 3.x.
            # Exception details are taken from sys.exc_info() below.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            p = '\t'.join(
                traceback.format_exception(exceptionType, exceptionValue,
                                           exceptionTraceback))
            log += "\t%s\n" % p
            continue
        passed += 1
    # BUG FIX: the docstring promises a (passed, failed, log) tuple but the
    # original fell off the end and returned None.
    return (passed, n - passed, log)
# --- Example 6 ---
def crossvalidation(data_file, folds=10):
    """
    Perform k-fold cross validation on a set of data and return the results.

    This is a randomized algorithm (the data is shuffled), so you should
    NOT use it for regression testing.

    data_file should be a file with each line containing a triple quoted
    canonical string representation, i.e. call repr on the string.
    The text to be hi-lighted should be wrapped in <s>...</s> tags.

    Returns a dictionary holding statistics about the performance of the
    classifier: {'raw accuracy': float, 'matrix': confusion matrix keyed
    by true label, then predicted label, row-normalized}.

    Raises Exception when folds <= 1 or folds exceeds the data size.
    """

    if folds <= 1:
        raise Exception(
            "Number of folds is too small. A value greater than 1 is required."
        )

    # SECURITY NOTE: eval() executes arbitrary code from the data file —
    # only run this on trusted input (ast.literal_eval is the safe parse).
    # BUG FIX: use a context manager so the file handle is closed.
    with open(data_file) as fh:
        training_data = [eval(line) for line in fh.readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception(
            "Number of folds is greater than number of data points.")

    # shuffle the training data so we dont have any funky correlation issues
    # with its ordering.
    random.shuffle(training_data)

    parsed_data = [library.parsetext(text) for text in training_data]
    # First element of each parsed pair is the token itself.
    tokens = [[parsed_data[i][j][0] for j in range(len(parsed_data[i]))]
              for i in range(N)]
    bio_data = [library.tokens_to_BIO(tokens[i]) for i in range(N)]

    # BUG FIX: the old partition used a rounded per-fold size, which could
    # drop trailing items or create an empty fold (a ZeroDivisionError at
    # the accuracy update below, e.g. N=15, folds=10). Integer boundaries
    # cover every index exactly once and every fold is non-empty for N >= folds.
    fold_index = []
    for i in range(folds):
        fold_index.append(range(i * N // folds, (i + 1) * N // folds))

    accuracy = 0.0
    classes = ['B', 'I', 'O']
    # Confusion matrix: matrix[true_label][predicted_label].
    matrix = {
        'B': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'I': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'O': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        }
    }

    for i in range(folds):
        print("Fold %d." % i)
        testing = fold_index[i]
        training = list(set(testing).symmetric_difference(set(range(N))))

        testing_data = [bio_data[d] for d in testing]
        training_data = [bio_data[d] for d in training]

        PFC = library.tuned_model(training_data)

        passed = 0
        for test in testing_data:
            tokens = test[0]
            labels = test[1]

            out = library.label_nb(PFC, tokens)

            # A sequence only counts as passed when every label matches.
            if out == labels:
                passed += 1

            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0

        accuracy += passed / float(len(testing))

    raw_accuracy = accuracy / float(folds)

    # Row-normalize the confusion matrix into per-true-label proportions.
    # BUG FIX: guard against a label that never occurred (s == 0 divided by
    # zero before); such a row is simply left as all zeros.
    for c1 in classes:
        s = sum([matrix[c1][c2] for c2 in classes])
        if s:
            for c2 in matrix[c1]:
                matrix[c1][c2] = matrix[c1][c2] / s

    print("Raw Accuracy: %1.2f" % raw_accuracy)
    print("------------------------")
    print("   |   B  |   I  |   O  ")
    print("------------------------")
    print(" B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'],
                                           matrix['B']['O']))
    print("------------------------")
    print(" I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'],
                                           matrix['I']['O']))
    print("------------------------")
    print(" O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'],
                                           matrix['O']['O']))
    print("------------------------")

    return {'raw accuracy': raw_accuracy, 'matrix': matrix}