def test_perceptron_id_features(corpus, train_seq, dev_seq, test_seq):
    """Check structured-perceptron scores after a single training epoch.

    Builds ID features from ``train_seq``, trains for one epoch, decodes the
    train/dev/test splits with Viterbi and compares each evaluation score
    with its stored reference value, within the externally defined
    ``tolerance``.
    """
    id_features = idfc.IDFeatures(train_seq)
    id_features.build_features()

    perceptron = spc.StructuredPerceptron(
        corpus.word_dict, corpus.tag_dict, id_features)
    perceptron.num_epochs = 1
    perceptron.train_supervised(train_seq)

    splits = (train_seq, dev_seq, test_seq)
    # Decode every split first, then score them, in train/dev/test order.
    decoded = [perceptron.viterbi_decode_corpus(split) for split in splits]
    score_train, score_dev, score_test = [
        perceptron.evaluate_corpus(split, labels)
        for split, labels in zip(splits, decoded)
    ]

    assert abs(score_train - 0.7980868285504047) < tolerance
    assert abs(score_dev - 0.7641866330390921) < tolerance
    assert abs(score_test - 0.7187039764359352) < tolerance
def test_crf_id_features(corpus, train_seq, dev_seq, test_seq):
    """Check the online CRF test-split score after a single training epoch.

    Builds ID features from ``train_seq``, trains ``CRFOnline`` for one
    epoch, decodes ``test_seq`` with Viterbi and compares the evaluation
    score with its stored reference value, within the externally defined
    ``tolerance``.
    """
    id_features = idfc.IDFeatures(train_seq)
    id_features.build_features()

    model = crfo.CRFOnline(corpus.word_dict, corpus.tag_dict, id_features)
    model.num_epochs = 1
    model.train_supervised(train_seq)

    # The train and dev checks are disabled here; only the test split is
    # decoded and verified. Their reference values were:
    #   train: 0.8394407652685798
    #   dev:   0.7957124842370744

    test_predictions = model.viterbi_decode_corpus(test_seq)
    test_score = model.evaluate_corpus(test_seq, test_predictions)
    assert abs(test_score - 0.7849779086892489) < tolerance
# Example no. 3
# 0
import lxmls.sequences.extended_feature as exfc

# CRF exercise script: train an online CRF with ID features on the training
# split and report its score on the train, dev and test splits.
#
# The print calls use the parenthesised single-argument form, which emits the
# same text under Python 2 and Python 3 (the bare `print "..."` statement is
# a SyntaxError on Python 3).
print("CRF Exercise")

# Load the three CoNLL splits through one shared corpus object.
# NOTE(review): max_sent_len / max_nr_sent presumably cap the sentence length
# and the number of sentences read -- confirm against PostagCorpus.
corpus = pcc.PostagCorpus()
train_seq = corpus.read_sequence_list_conll("data/train-02-21.conll",
                                            max_sent_len=10,
                                            max_nr_sent=1000)
test_seq = corpus.read_sequence_list_conll("data/test-23.conll",
                                           max_sent_len=10,
                                           max_nr_sent=1000)
dev_seq = corpus.read_sequence_list_conll("data/dev-22.conll",
                                          max_sent_len=10,
                                          max_nr_sent=1000)

# Features are built from the training split only.
feature_mapper = idfc.IDFeatures(train_seq)
feature_mapper.build_features()

# Train the online CRF for 20 epochs.
crf_online = crfo.CRFOnline(corpus.word_dict, corpus.tag_dict, feature_mapper)
crf_online.num_epochs = 20
crf_online.train_supervised(train_seq)

# Viterbi-decode every split, then score the predictions against each split.
pred_train = crf_online.viterbi_decode_corpus(train_seq)
pred_dev = crf_online.viterbi_decode_corpus(dev_seq)
pred_test = crf_online.viterbi_decode_corpus(test_seq)
eval_train = crf_online.evaluate_corpus(train_seq, pred_train)
eval_dev = crf_online.evaluate_corpus(dev_seq, pred_dev)
eval_test = crf_online.evaluate_corpus(test_seq, pred_test)

print("CRF - ID Features Accuracy Train: %.3f Dev: %.3f Test: %.3f" % (
    eval_train, eval_dev, eval_test))