def make_pos_model(model_type):
    """Train an NLTK POS tagger on the local Greek training set and pickle it.

    :param model_type: one of 'unigram', 'bigram', 'trigram', 'backoff', 'tnt'.
    :raises ValueError: if ``model_type`` is not one of the supported names.

    Side effects: reads ``greek_training_set.pos`` from the current directory
    and writes the trained model under ``~/greek_models_cltk/taggers/pos``.
    """
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file_name = 'unigram.pickle'  # renamed from `file`: don't shadow the builtin
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file_name = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file_name = 'trigram.pickle'
    elif model_type == 'backoff':
        # Trigram falls back to bigram, which falls back to unigram.
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file_name = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file_name = 'tnt.pickle'
    else:
        # Bug fix: the original only printed a message and fell through,
        # crashing with NameError on the undefined `tagger`/`file` below.
        raise ValueError('Invalid model_type: {0!r}'.format(model_type))
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    os.makedirs(_dir, exist_ok=True)  # ensure target directory exists before writing
    path = os.path.join(_dir, file_name)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
def make_pos_model(model_type):
    """Train an NLTK POS tagger on the local Greek training set and pickle it.

    :param model_type: one of 'unigram', 'bigram', 'trigram', 'backoff', 'tnt'.
    :raises ValueError: if ``model_type`` is not one of the supported names.

    Side effects: reads ``greek_training_set.pos`` from the current directory
    and writes the trained model under ``~/greek_models_cltk/taggers/pos``.
    """
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file_name = 'unigram.pickle'  # renamed from `file`: don't shadow the builtin
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file_name = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file_name = 'trigram.pickle'
    elif model_type == 'backoff':
        # Trigram falls back to bigram, which falls back to unigram.
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file_name = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file_name = 'tnt.pickle'
    else:
        # Bug fix: the original only printed a message and fell through,
        # crashing with NameError on the undefined `tagger`/`file` below.
        raise ValueError('Invalid model_type: {0!r}'.format(model_type))
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    os.makedirs(_dir, exist_ok=True)  # ensure target directory exists before writing
    path = os.path.join(_dir, file_name)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
def train_tagger(language, model_type, feature, train_sents):
    """Train and return an NLTK tagger of the requested kind.

    :param language: language code, used only to build the CRF model path.
    :param model_type: one of 'unigram', 'bigram', 'trigram', 'backoff',
        'crf', 'perceptron'.
    :param feature: feature name, used only to build the CRF model path.
    :param train_sents: tagged sentences to train on.
    :return: the trained tagger object.
    :raises ValueError: if ``model_type`` is not a supported name.
    """
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        # Trigram falls back to bigram, which falls back to unigram.
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        # CRFTagger.train persists the model to the given path as it trains.
        tagger.train(train_sents,
                     'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    else:
        # Bug fix: the original had no else branch, so an unknown model_type
        # hit `return tagger` with `tagger` unbound (UnboundLocalError).
        raise ValueError('Unknown model_type: {0!r}'.format(model_type))
    return tagger
def contextual_rules(wikicorpus_dir, context_file):
    """Learn Brill contextual rules from a wikicorpus and write them to a file.

    Reads up to 1,000,000 words of tagged sentences, replaces every proper
    noun with a fixed placeholder token, trains a Brill tagger on top of a
    unigram baseline, and writes the learned rules (one per line) to
    ``context_file``.
    """
    sentences = wikicorpus(wikicorpus_dir, words=1000000)
    ANONYMOUS = "anonymous"
    # Anonymize proper nouns so rules generalize over names rather than
    # memorizing specific ones ("NP" = proper noun in the Parole tagset).
    for sent in sentences:
        for idx, (word, pos) in enumerate(sent):
            if pos == "NP":
                sent[idx] = (ANONYMOUS, "NP")
    # Baseline unigram tagger, boosted with the fntbl37 rule templates.
    baseline = UnigramTagger(sentences)
    trainer = BrillTaggerTrainer(baseline, fntbl37(), trace=0)
    brill_tagger = trainer.train(sentences, max_rules=100)
    # Persist the learned contextual rules, one per line.
    with open(context_file, "w") as rules_out:
        for rule in brill_tagger.rules():
            rules_out.write("%s\n" % rule)
s[i] = (ANONYMOUS, "NP") # We can then train NLTK's FastBrillTaggerTrainer. It is based on a unigram tagger, which is simply a lexicon of known words # and their part-of-speech tag. It will then boost the accuracy with a set of contextual rules that change a word's part-of-speech # tag depending on the surrounding words. from nltk.tag import UnigramTagger from nltk.tag import FastBrillTaggerTrainer from nltk.tag.brill import SymmetricProximateTokensTemplate from nltk.tag.brill import ProximateTokensTemplate from nltk.tag.brill import ProximateTagsRule from nltk.tag.brill import ProximateWordsRule ctx = [ # Context = surrounding words and tags. SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateWordsRule, (0, 0)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)), ] tagger = UnigramTagger(sentences) tagger = FastBrillTaggerTrainer(tagger, ctx, trace=0) tagger = tagger.train(sentences, max_rules=100) #print tagger.evaluate(wikicorpus(10000, start=1))