Esempio n. 1
0
        return(setup_database(options.drop))

    if not options.model:
        print >> sys.stdout, 'You must specify a model file for all phases of the algorithm.\n\n'
        optparser.print_usage()
        sys.exit(1)

    model = {}
    if options.training:
        files = [line.rstrip() for line in open(options.training).readlines()]
        if os.path.exists('training.dat'):
            hash = pickle.load(open('training.dat','r'))
            if hash == files and os.path.exists(options.model):
                return 0
        training_data = [open(file).read() for file in files if os.path.exists(file)]
        model = library.create_model(training_data)
        pickle.dump(model, open(options.model,'w'))
        pickle.dump(files, open('training.dat', 'w'))

    try:
        model = pickle.load(open(options.model))
    except:
        print >> sys.stdout, 'You must specify a training file to create a model.\n\n'
        optparser.print_usage()
        sys.exit(1)

    if options.version:
        print "Source hash: %s" % hex(abs(hash(open(sys.argv[0]).read())))
        print 'Model hash: %s' % (model['id'])
    
    if options.analyze_from_file:
Esempio n. 2
0
def test2():
    """
    Check that the naive Bayes labeller copes with tokens it has not seen.

    A model is built with ``library.create_model`` from four sequences in
    which 'C' always opens the ``<s>...</s>`` statement.  Each row of
    ``testdata`` (which contains the untrained tokens 'r', 'x', 'y', 'z'
    and 'w') is then labelled with ``library.label_nb`` and compared with
    the matching row of ``correct_output``.

    This test will fail if our algorithm for smoothing tokens that we have
    not seen yet is broken. Since 'C' is the trigger for the start of a
    statement the probability of 'C unknown' should be very high.

    Intended to return a tuple giving the number of tests passed, the number
    of tests failed and a string holding all log messages, e.g.
        (3, 4, 'Failed Test[1]: Divide by 0.').
    """
    # NOTE(review): only the scoring loop is visible in this excerpt; the
    # code that builds the documented return value is expected to follow
    # the loop -- confirm against the full file.

    test = 2
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H <s>C E F D</s> A B I J''',
        '''I J <s>C F D E</s> G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    testdata = [
        ['r', 'x', 'x', 'x', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['r', 'x', 'x', 'y', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'D', 'z', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'w', 'I', 'J', 'G', 'H'],
    ]
    correct_output = [
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i, tokens in enumerate(testdata):
        try:
            out = library.label_nb(model['P(F|C)'], tokens)
            expected = correct_output[i]

            # Classify the mismatch first so both failure kinds share one
            # reporting path.
            if len(out) != len(expected):
                problem = "was the incorrect length"
            elif out != expected:
                problem = "was incorrect"
            else:
                problem = None

            if problem is not None:
                # The length branch used to format with the undefined name
                # `tet`; the resulting NameError was swallowed below and the
                # failure was mis-reported as an exception.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() %s.\n" % problem
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                                                          str(expected))
                continue
        except Exception:
            # Deliberately broad: any error raised while labelling or
            # comparing counts as a failed case and is logged, not raised.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            # Pass exc_info straight through so the traceback object is
            # never bound to a local name.
            trace_text = '\t'.join(
                traceback.format_exception(*sys.exc_info()))
            log += "\t%s\n" % trace_text
            continue
        passed += 1
Esempio n. 3
0
def test2():
    """
    Check that the naive Bayes labeller copes with tokens it has not seen.

    A model is built with ``library.create_model`` from four sequences in
    which 'C' always opens the ``<s>...</s>`` statement.  Each row of
    ``testdata`` (which contains the untrained tokens 'r', 'x', 'y', 'z'
    and 'w') is then labelled with ``library.label_nb`` and compared with
    the matching row of ``correct_output``.

    This test will fail if our algorithm for smoothing tokens that we have
    not seen yet is broken. Since 'C' is the trigger for the start of a
    statement the probability of 'C unknown' should be very high.

    Intended to return a tuple giving the number of tests passed, the number
    of tests failed and a string holding all log messages, e.g.
        (3, 4, 'Failed Test[1]: Divide by 0.').
    """
    # NOTE(review): only the scoring loop is visible in this excerpt; the
    # code that builds the documented return value is expected to follow
    # the loop -- confirm against the full file.

    test = 2
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H <s>C E F D</s> A B I J''',
        '''I J <s>C F D E</s> G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    testdata = [
        ['r', 'x', 'x', 'x', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['r', 'x', 'x', 'y', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'D', 'z', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'w', 'I', 'J', 'G', 'H'],
    ]
    correct_output = [
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i, tokens in enumerate(testdata):
        try:
            out = library.label_nb(model['P(F|C)'], tokens)
            expected = correct_output[i]

            # Classify the mismatch first so both failure kinds share one
            # reporting path.
            if len(out) != len(expected):
                problem = "was the incorrect length"
            elif out != expected:
                problem = "was incorrect"
            else:
                problem = None

            if problem is not None:
                # The length branch used to format with the undefined name
                # `tet`; the resulting NameError was swallowed below and the
                # failure was mis-reported as an exception.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() %s.\n" % problem
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(expected))
                continue
        except Exception:
            # Deliberately broad: any error raised while labelling or
            # comparing counts as a failed case and is logged, not raised.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            # Pass exc_info straight through so the traceback object is
            # never bound to a local name.
            trace_text = '\t'.join(
                traceback.format_exception(*sys.exc_info()))
            log += "\t%s\n" % trace_text
            continue
        passed += 1
Esempio n. 4
0
def test1():
    """
    Check that the naive Bayes labeller reproduces labels for seen data.

    A model is built with ``library.create_model`` from four training
    sequences, two of which carry an ``<s>...</s>`` statement and two of
    which carry none.  The same token sequences (markup removed) are then
    labelled with ``library.label_nb`` and compared row by row with
    ``correct_output``.

    Please note that naive Bayes is a linear classifier so there may be data
    that it will not be able to correctly classify even though it has been
    trained on the same data.

    Intended to return a tuple giving the number of tests passed, the number
    of tests failed and a string holding all log messages, e.g.
        (3, 4, 'Failed Test [1]: Divide by 0.').
    """
    # NOTE(review): only the scoring loop is visible in this excerpt; the
    # code that builds the documented return value is expected to follow
    # the loop -- confirm against the full file.

    test = 1
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H C E F D A B I J''',
        '''I J C F D E G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    testdata = [
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['G', 'H', 'C', 'E', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'F', 'D', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'f', 'I', 'J', 'G', 'H'],
    ]
    correct_output = [
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i, tokens in enumerate(testdata):
        try:
            out = library.label_nb(model['P(F|C)'], tokens)
            expected = correct_output[i]

            # Classify the mismatch first so both failure kinds share one
            # reporting path.
            if len(out) != len(expected):
                problem = "was the incorrect length"
            elif out != expected:
                problem = "was incorrect"
            else:
                problem = None

            if problem is not None:
                # The length branch used to format with the undefined name
                # `tet`; the resulting NameError was swallowed below and the
                # failure was mis-reported as an exception.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() %s.\n" % problem
                log += "\tGot '%s' instead of '%s'.\n" % (str(out),
                                                          str(expected))
                continue
        except Exception:
            # Deliberately broad: any error raised while labelling or
            # comparing counts as a failed case and is logged, not raised.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            # Pass exc_info straight through so the traceback object is
            # never bound to a local name.
            trace_text = '\t'.join(
                traceback.format_exception(*sys.exc_info()))
            log += "\t%s\n" % trace_text
            continue
        passed += 1
Esempio n. 5
0
def test1():
    """
    Check that the naive Bayes labeller reproduces labels for seen data.

    A model is built with ``library.create_model`` from four training
    sequences, two of which carry an ``<s>...</s>`` statement and two of
    which carry none.  The same token sequences (markup removed) are then
    labelled with ``library.label_nb`` and compared row by row with
    ``correct_output``.

    Please note that naive Bayes is a linear classifier so there may be data
    that it will not be able to correctly classify even though it has been
    trained on the same data.

    Intended to return a tuple giving the number of tests passed, the number
    of tests failed and a string holding all log messages, e.g.
        (3, 4, 'Failed Test [1]: Divide by 0.').
    """
    # NOTE(review): only the scoring loop is visible in this excerpt; the
    # code that builds the documented return value is expected to follow
    # the loop -- confirm against the full file.

    test = 1
    trainingdata = [
        '''A B <s>C D E F</s> G H I J''',
        '''G H C E F D A B I J''',
        '''I J C F D E G H A B''',
        '''a b <s>C D E f</s> I J G H''',
    ]
    testdata = [
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
        ['G', 'H', 'C', 'E', 'F', 'D', 'A', 'B', 'I', 'J'],
        ['I', 'J', 'C', 'F', 'D', 'E', 'G', 'H', 'A', 'B'],
        ['a', 'b', 'C', 'D', 'E', 'f', 'I', 'J', 'G', 'H'],
    ]
    correct_output = [
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
        ['O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O'],
    ]
    model = library.create_model(trainingdata)

    n = len(testdata)
    passed = 0
    log = "Test%d started at %s.\n" % (test, time.ctime())
    for i, tokens in enumerate(testdata):
        try:
            out = library.label_nb(model['P(F|C)'], tokens)
            expected = correct_output[i]

            # Classify the mismatch first so both failure kinds share one
            # reporting path.
            if len(out) != len(expected):
                problem = "was the incorrect length"
            elif out != expected:
                problem = "was incorrect"
            else:
                problem = None

            if problem is not None:
                # The length branch used to format with the undefined name
                # `tet`; the resulting NameError was swallowed below and the
                # failure was mis-reported as an exception.
                log += "Test%d [%d] Failed.\n" % (test, i)
                log += "\tOutput from library.label_nb() %s.\n" % problem
                log += "\tGot '%s' instead of '%s'.\n" % (
                    str(out), str(expected))
                continue
        except Exception:
            # Deliberately broad: any error raised while labelling or
            # comparing counts as a failed case and is logged, not raised.
            log += "Test%d [%d] Failed.\n" % (test, i)
            log += "\tRecieved the following exception:\n"
            # Pass exc_info straight through so the traceback object is
            # never bound to a local name.
            trace_text = '\t'.join(
                traceback.format_exception(*sys.exc_info()))
            log += "\t%s\n" % trace_text
            continue
        passed += 1