def _get_pretrain_model():
    """Train a fresh PerceptronTagger on ``train_sents``, persist it to
    ``PICKLE``, and print its accuracy on ``test_sents``."""
    tagger = PerceptronTagger(load=False)
    tagger.train(sentences=train_sents, save_loc=PICKLE)
    print('Accuracy : ', tagger.evaluate(test_sents))
# ---- Example (Esempio) n. 2 — snippet separator, score 0 ----
        # Fragment: the enclosing method's header lies outside this chunk.
        # Pipes the space-joined tokens through an external tagger program;
        # `ret` (exit status) and `err` (stderr) are discarded here —
        # presumably best-effort; confirm against the full definition.
        ret, tags, err = pipe_through_prog(prog, ' '.join(tokens))
        return tags

    def tag_sents(self, sentences):
        """Tag a batch of sentences.

        Each sentence (an iterable of token strings) is joined into a single
        space-separated string, and the whole batch is delegated to ``tag``.
        """
        joined = [' '.join(sentence) for sentence in sentences]
        return self.tag(joined)

    def evaluate(self, gold):
        """Score this tagger against *gold* (tagged sentences).

        Strips the gold tags, re-tags the raw sentences via ``tag_sents``,
        then returns the accuracy of the output against the flattened
        gold tokens.
        """
        predictions = self.tag_sents(untag(sentence) for sentence in gold)
        reference = list(itertools.chain.from_iterable(gold))
        return accuracy(reference, predictions)


if __name__ == '__main__':
    # Benchmark NLTK's PerceptronTagger against Prose's tagger on the
    # Penn Treebank sample and print a Markdown-style comparison table.
    sents = treebank.tagged_sents()
    nltk_tagger = PerceptronTagger()

    start = time.time()
    nltk_tagger.tag_sents(untag(sent) for sent in sents)  # timing run only
    elapsed = time.time() - start

    headers = ['Library', 'Accuracy', 'Time (sec)']
    nltk_row = ['NLTK', round(nltk_tagger.evaluate(sents), 3), round(elapsed, 3)]
    # AP_TIME is collected elsewhere in the file — presumably by the Prose
    # pipe wrapper; verify against the full module.
    prose_row = ['Prose', round(APTagger().evaluate(sents), 3), round(AP_TIME, 3)]
    print(tabulate([nltk_row, prose_row], headers, tablefmt='pipe'))
# ---- Example (Esempio) n. 3 — snippet separator, score 0 ----
        # Fragment: the enclosing method's header lies outside this chunk.
        # Flattens the gold-standard sentences into one token list, dumps it
        # as JSON for debugging, and scores the tagger output against it.
        gold_tokens = list(itertools.chain(*gold))
        print(json.dumps(gold_tokens))
        # NOTE(review): prints sentence count next to token count — looks
        # like a debug line; confirm it is intentional.
        print(len(tagged_sents), len(gold_tokens))
        return accuracy(gold_tokens, tagged_sents)


if __name__ == '__main__':
    # Benchmark: average NLTK's PerceptronTagger tagging time over 5 runs
    # on the Penn Treebank sample, then report accuracy in a pipe table.
    sents = treebank.tagged_sents()
    nltk_tagger = PerceptronTagger()

    print("Timing NLTK ...")
    run_times = []
    for _ in range(5):
        start = time.time()
        nltk_tagger.tag_sents(untag(sent) for sent in sents)
        run_times.append(time.time() - start)
    avg_time = round(sum(run_times) / len(run_times), 3)
    '''NOTE: Moved to tag_test.go
    print("Timing prose ...")
    acc = round(APTagger().evaluate(sents), 3)
    ap_time = round(sum(AP_TIME) / len(AP_TIME), 3)
    '''

    print("Evaluating accuracy ...")
    headers = ['Library', 'Accuracy', '5-Run Average (sec)']
    table = [
        ['NLTK', round(nltk_tagger.evaluate(sents), 3), avg_time],
        # ['`prose`', acc, ap_time]
    ]
    print(tabulate(table, headers, tablefmt='pipe'))
import cowparser as cp

# Build training and test sets from the COW corpus stream, then train a
# Dutch PerceptronTagger model and report its accuracy.
train_sents = []
test_sents = []

gen = cp.sentences_for_dir(separate=False)
# First 2,000,001 sentences (index 0..2000000) go to training; each triple
# (word, tag, extra) is reduced to a (word, tag) pair.
for idx, (_, data) in enumerate(gen):
    train_sents.append([(word, tag) for word, tag, _ in data])
    if idx == 2000000:
        break

# The generator keeps its position, so this collects the *next* 5,001
# sentences as the held-out test set.
for idx, (_, data) in enumerate(gen):
    test_sents.append([(word, tag) for word, tag, _ in data])
    if idx == 5000:
        break

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(train_sents, 'model2.perc.dutch_tagger')
print(tagger.evaluate(test_sents))