Example #1
import nltk
from nltk.tag import tnt, perceptron, CRFTagger

# load_pkl / save_pkl are pickle helpers defined elsewhere (a sketch follows the example)

def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # cap the number of iterations, since CRF training is slow otherwise
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    # nltk.UnigramTagger trains at construction time, so it is already done
    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    # the CRF tagger persists its own model file, so no pickling is needed
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
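
The example relies on `load_pkl` and `save_pkl` helpers that are not shown here. A minimal sketch of what they might look like, assuming plain pickle files named `<name>.pkl` in the working directory (the naming scheme is an assumption, not part of the original):

import pickle

def load_pkl(name):
    # hypothetical helper: read a pickled object from '<name>.pkl'
    with open(f'{name}.pkl', 'rb') as f:
        return pickle.load(f)

def save_pkl(obj, name):
    # hypothetical helper: write an object to '<name>.pkl'
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(obj, f)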
Example #2
# coding: utf-8

import os
from glob import glob
from zipfile import ZipFile
from collections import Counter
import pickle
import re
from numpy import prod
from nltk import sent_tokenize, ngrams, PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import perceptron
# pre-trained averaged perceptron POS tagger plus three word normalizers
tagger = perceptron.PerceptronTagger()
porter = PorterStemmer()
snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def word_normalize(word, stemmer=None):
    """Lowercase a word and optionally stem or lemmatize it."""
    w = word.lower()
    if stemmer == 'porter':
        w = porter.stem(w)
    elif stemmer == 'snowball':
        w = snowball.stem(w)
    elif stemmer == 'lemma':
        # WordNet lemmatization defaults to treating the word as a noun
        w = lemmatizer.lemmatize(w)
    return w
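
A quick usage sketch of `word_normalize`; the outputs below are typical for NLTK's Porter stemmer and WordNet lemmatizer:

>>> word_normalize('Studies')
'studies'
>>> word_normalize('Studies', stemmer='porter')
'studi'
>>> word_normalize('Studies', stemmer='lemma')
'study'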