Esempio n. 1
0
def main(model_dir,
         train_loc,
         dev_loc,
         iters=5,
         n_sents=0,
         feat_thresh=5,
         beam_width=4):
    """Train a tagger on ``train_loc`` and print its accuracy on ``dev_loc``.

    Args:
        model_dir: Forwarded to ``redshift.tagger.train`` as its second
            positional argument.
        train_loc: Path of the training file, read as newline-separated
            sentence strings. Every ``|`` is replaced with ``/``.
        dev_loc: Path of the development file; each line is turned into an
            input via ``Input.from_pos`` after the same ``|`` -> ``/``
            replacement.
        iters: Forwarded to the trainer as ``nr_iter``.
        n_sents: When non-zero, only the first ``n_sents`` training lines
            are used; ``0`` means no limit.
        feat_thresh: Forwarded to the trainer as ``feat_thresh``.
        beam_width: Forwarded to the trainer as ``beam_width``.

    The fraction of dev tokens whose tag after tagging equals the tag read
    before tagging is printed to stdout; nothing is returned.
    """
    # Context managers close the files promptly instead of leaking handles.
    with open(train_loc) as train_file:
        sent_strs = train_file.read().strip().replace('|', '/').split('\n')
    # Apply limit
    if n_sents != 0:
        sent_strs = sent_strs[:n_sents]
    tagger = redshift.tagger.train('\n'.join(sent_strs),
                                   model_dir,
                                   beam_width=beam_width,
                                   nr_iter=iters,
                                   feat_thresh=feat_thresh)
    with open(dev_loc) as dev_file:
        dev_lines = dev_file.read().strip().split('\n')
    dev_input = [Input.from_pos(s.replace('|', '/')) for s in dev_lines]
    # Tiny non-zero start: with no tokens counted, the division below
    # yields 0.0 rather than raising ZeroDivisionError.
    t = 1e-100
    c = 0
    for sent in dev_input:
        # Read the tags before tagging; the comparison below re-reads
        # token.tag afterwards (presumably tagger.tag overwrites them in
        # place -- confirm against the tagger implementation).
        gold_tags = [tok.tag for tok in sent.tokens]
        tagger.tag(sent)
        for i, token in enumerate(sent.tokens):
            c += gold_tags[i] == token.tag
            t += 1
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(c / t)
Esempio n. 2
0
def test_tag(tagger, sentence):
    """Tag ``sentence`` and check its length and its first five words."""
    tagger.tag(sentence)
    # NOTE(review): length is expected to be 7 while only five words are
    # checked below -- presumably the fixture carries extra tokens; confirm.
    assert sentence.length == 7
    tokens = list(sentence.tokens)
    expected_words = ('This', 'is', 'a', 'test', '.')
    for index, expected in enumerate(expected_words):
        assert tokens[index].word == expected
Esempio n. 3
0
def test_tag(tagger, sentence):
    """Run the tagger over ``sentence``, then verify length and leading words."""
    tagger.tag(sentence)
    # NOTE(review): the sentence must report length 7 although only the
    # first five words are compared -- verify against the fixture.
    assert sentence.length == 7
    tagged = list(sentence.tokens)
    leading_words = ['This', 'is', 'a', 'test', '.']
    for position, word in enumerate(leading_words):
        assert tagged[position].word == word
Esempio n. 4
0
def main(model_dir, train_loc, dev_loc, iters=5, n_sents=0, feat_thresh=5, beam_width=4):
    """Train a tagger on ``train_loc`` and print its accuracy on ``dev_loc``.

    Args:
        model_dir: Forwarded to ``redshift.tagger.train`` as its second
            positional argument.
        train_loc: Path of the training file, read as newline-separated
            sentence strings. Every ``|`` is replaced with ``/``.
        dev_loc: Path of the development file; each line is turned into an
            input via ``Input.from_pos`` after the same ``|`` -> ``/``
            replacement.
        iters: Forwarded to the trainer as ``nr_iter``.
        n_sents: When non-zero, only the first ``n_sents`` training lines
            are used; ``0`` means no limit.
        feat_thresh: Forwarded to the trainer as ``feat_thresh``.
        beam_width: Forwarded to the trainer as ``beam_width``.

    The fraction of dev tokens whose tag after tagging equals the tag read
    before tagging is printed to stdout; nothing is returned.
    """
    # Use context managers so the file handles are closed, not leaked.
    with open(train_loc) as train_file:
        sent_strs = train_file.read().strip().replace('|', '/').split('\n')
    # Apply limit
    if n_sents != 0:
        sent_strs = sent_strs[:n_sents]
    tagger = redshift.tagger.train('\n'.join(sent_strs), model_dir,
        beam_width=beam_width, nr_iter=iters, feat_thresh=feat_thresh)
    with open(dev_loc) as dev_file:
        dev_lines = dev_file.read().strip().split('\n')
    dev_input = [Input.from_pos(s.replace('|', '/')) for s in dev_lines]
    # Tiny non-zero start: with no tokens counted, the division below
    # yields 0.0 rather than raising ZeroDivisionError.
    t = 1e-100
    c = 0
    for sent in dev_input:
        # Read the tags before tagging; the comparison below re-reads
        # token.tag afterwards (presumably tagger.tag overwrites them in
        # place -- confirm against the tagger implementation).
        gold_tags = [tok.tag for tok in sent.tokens]
        tagger.tag(sent)
        for i, token in enumerate(sent.tokens):
            c += gold_tags[i] == token.tag
            t += 1
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(c / t)