# Example 1
def train_and_test(train_file, test_file, smooth, out_file):
    sentences = None
    with open(train_file) as f:
        train_sentences = pos_file_parser(f)
    if not train_sentences:
        exit('error parsing sentences')

    # Train model
    train_tags = get_tags_from_sentences(train_sentences)

    n = 3

    lang_mod = LanguageModel(train_tags, n, smooth)
    lexi_mod = LexicalModel(train_sentences, train_tags, smooth)

    with open(test_file) as f:
        test_sentences = pos_file_parser(f)
    if not test_sentences:
        exit('error parsing sentences')

    # Remove sentences longer than 15
    test_sentences = [s for s in test_sentences if len(s) < 16]

    test_tags = get_tags_from_sentences(test_sentences)
    test_words = get_words_from_sentences(test_sentences)

    # predict from test set and write to file
    with open(out_file, 'w') as f:
        predicted_tags = []
        for s in test_words:
            pt = viterbi(s, lang_mod, lexi_mod)
            #print pt
            f.write(' '.join(s[:-1]) + '\n')
            f.write(' '.join(pt) + '\n')
            #f.flush()
            predicted_tags.append(pt)

        print "Done with viterbi"

        correct = 0
        total = 0

        if len(test_words) > 1 and len(predicted_tags) > 1:
            for words, predicted, tags in zip(test_words, predicted_tags,
                                              test_tags):
                total = total + len(tags)

                for tag, pred in zip(tags, predicted):
                    correct = correct + (pred == tag)
        else:
            # If we're testing only one sentence
            f.write(str(test_words[0]) + '\n')
            f.write(str(predicted_tags[0]) + '\n')
            total = total + len(predicted_tags[0])
            for pred, tag in zip(predicted_tags[0], test_tags[0]):
                correct = correct + (pred == tag)

    print 'Accuracy: ' + str(round(float(correct) / total * 10000) / 100) + '%'
# Example 2
def viterbi_test_run(smooth=False):
    with open('data/s4/WSJ02-21.pos') as f:
        sentences = pos_file_parser(f)
    if not sentences:
        exit('error parsing sentences')
    tags = get_tags_from_sentences(sentences)

    n = 3

    lang_mod = LanguageModel(tags, n, smooth)
    lexi_mod = LexicalModel(sentences, tags, smooth)

    path = viterbi('New York is in trouble'.split(), lang_mod, lexi_mod)
    print 'New York is in trouble: ' + str(path)

    path = viterbi('New York is in KAUDERWELSCH'.split(), lang_mod, lexi_mod)
    print 'New York is in KAUDERWELSCH: ' + str(path)
# Example 3
def test(smoothe):
    with open('data/s4/simple.pos') as f:
        sentences = pos_file_parser(f)
    if not sentences:
        exit('error parsing sentences')

    tags = get_tags_from_sentences(sentences)

    n = 3
    lang_mod = LanguageModel(tags, n, smoothe)
    lexi_mod = LexicalModel(sentences, tags, smoothe)
    try:

        print 'test language model on simple.pos without smoothing:'

        assert lang_mod.cond_prob(['START', 'DT',
                                   'NN']) == 1.0, 'test 1 failed'
        print 'test 1 passed'

        assert lang_mod.cond_prob(['JJ', 'NN', 'NNS'
                                   ]) == 0.3333333333333333, 'test 2 failed'
        print 'test 2 passed'

        assert lang_mod.cond_prob(['NN', 'IN', 'DT']) == 0.5, 'test 3 failed'
        print 'test 3 passed'

        print 'test lexical model on simple.pos without smoothing:'

        assert lexi_mod.cond_prob(['firm', 'NN']) == 0.0, 'test 1 failed'
        print 'test 1 passed'

        assert lexi_mod.cond_prob(['investment', 'NN'
                                   ]) == 0.18181818181818182, 'test 2 failed'
        print 'test 2 passed'

        assert lexi_mod.cond_prob(['Davis\\Zweig', 'NNP'
                                   ]) == 0.16666666666666666, 'test 3 failed'
        print 'test 3 passed'

        print 'test transition probabilities on simple.pos without smoothing:'
        n = lang_mod.next_n_min1_grams(['START', 'DT'])
        assert n == [(('DT', 'NN'), 1.0)], 'test 1 failed'
        print 'test 1 passed'

        n = lang_mod.next_n_min1_grams(['DT', 'NN'])
        assert n == [(('NN', 'MD'), 0.3333333333333333),
                     (('NN', 'JJ'), 0.3333333333333333),
                     (('NN', 'IN'), 0.3333333333333333)], "test 2 failed"
        print 'test 2 passed'

        print 'test emission probabilities on simple.pos without smoothing:'
        assert lexi_mod.emission_prob(
            'A', ['START', 'DT']) == 0.16666666666666666, 'test 1 failed'
        print 'test 1 passed'

        assert lexi_mod.emission_prob(
            'of', ['NN', 'IN']) == 0.6666666666666666, 'test 2 failed'
        print 'test 2 passed'

        assert lexi_mod.emission_prob(
            'on', ['NN', 'IN']) == 0.3333333333333333, 'test 3 failed'
        print 'test 3 passed'

        assert lang_mod.get_start_probabilites() == [(('START', 'NNPX'), 0.5),
                                                     (('START', 'DT'), 0.5)
                                                     ], 'test 4 failed'
        print 'test 4 passed'

        print 'ALL TESTS PASSED'
    except AssertionError as e:
        print e

    return (lang_mod, lexi_mod)