def __init__(self, **kwargs):
    """Set up the instance and a Brill trainer wrapped around ``kwargs["tagger"]``.

    All keyword arguments are forwarded unchanged to the parent initializer.
    The ``tagger`` keyword is additionally stored on ``self.tagger`` and used
    as the initial tagger of ``self.brillTrainer``.

    Raises:
        KeyError: if no ``tagger`` keyword argument was supplied.
    """
    super().__init__(**kwargs)
    self.tagger = kwargs["tagger"]

    # Offsets are relative to the token being re-tagged. The same eight
    # single-feature windows are used for tags (Pos) and for words (Word).
    windows = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    rule_templates = []
    for feature_cls in (brill.Pos, brill.Word):
        for window in windows:
            rule_templates.append(brill.Template(feature_cls(list(window))))
        # Two-feature template: one feature at -1 combined with one at +1.
        rule_templates.append(
            brill.Template(feature_cls([-1]), feature_cls([1]))
        )

    self.brillTrainer = BrillTaggerTrainer(
        self.tagger, rule_templates, deterministic=True
    )
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that post-corrects ``initial_tagger``.

    Eighteen rule templates are built: nine over neighbouring tags
    (``brill.Pos``) followed by nine over neighbouring words (``brill.Word``).

    Args:
        initial_tagger: tagger handed to ``BrillTaggerTrainer`` as the baseline.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``BrillTaggerTrainer.train`` returns.
    """
    # Offsets relative to the token under consideration.
    offset_groups = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    templates = []
    for feature in (brill.Pos, brill.Word):
        for offsets in offset_groups:
            templates.append(brill.Template(feature(list(offsets))))
        # Final template of each family pairs offset -1 with offset +1.
        templates.append(brill.Template(feature([-1]), feature([1])))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
def train_brill_tagger(initial_tagger, training, **kwargs):
    """Train a Brill tagger on top of an existing tagger.

    The Brill trainer learns rules, drawn from the templates built here,
    that correct the output of ``initial_tagger``.

    Args:
        initial_tagger: baseline tagger whose results are to be corrected.
        training: training sentences passed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``BrillTaggerTrainer.train`` returns.
    """
    single_feature_windows = [
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    ]

    # Tag-context templates are created first, then word-context templates.
    tag_templates = [
        brill.Template(brill.Pos(window)) for window in single_feature_windows
    ]
    tag_templates.append(brill.Template(brill.Pos([-1]), brill.Pos([1])))

    word_templates = [
        brill.Template(brill.Word(window)) for window in single_feature_windows
    ]
    word_templates.append(brill.Template(brill.Word([-1]), brill.Word([1])))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, tag_templates + word_templates, deterministic=True
    )
    return trainer.train(training, **kwargs)
def get_brill_tagger(self):
    """Train and return a Brill tagger from ``tagged_input_sentences.txt``.

    The training sentences are read from ``tagged_input_sentences.txt`` in
    the current directory (reader created with ``sep="/"``). The initial
    tagger is loaded from
    ``taggers/maxent_treebank_pos_tagger/english.pickle``. Training runs
    with ``trace=3`` and stops after at most 10 rules.

    Returns:
        The tagger produced by ``BrillTaggerTrainer.train``.
    """
    corpus_reader = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
    tagged_sentences = list(corpus_reader.tagged_sents())

    # NOTE(review): `load` is presumably nltk.data.load - confirm at the import site.
    base_tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

    # Offsets are relative to the token being re-tagged.
    windows = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )
    rule_templates = []
    for feature_cls in (brill.Pos, brill.Word):
        rule_templates.extend(
            brill.Template(feature_cls(list(window))) for window in windows
        )
        # Two-feature template: offset -1 combined with offset +1.
        rule_templates.append(
            brill.Template(feature_cls([-1]), feature_cls([1]))
        )

    brill_learner = BrillTaggerTrainer(base_tagger, templates=rule_templates, trace=3)
    return brill_learner.train(tagged_sentences, max_rules=10)
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger over ``initial_tagger`` with tracing enabled.

    Args:
        initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``BrillTaggerTrainer.train`` returns.
    """

    def template_family(feature):
        """Return the nine templates built from one feature class."""
        family = [
            brill.Template(feature([-1])),
            brill.Template(feature([1])),
            brill.Template(feature([-2])),
            brill.Template(feature([2])),
            brill.Template(feature([-2, -1])),
            brill.Template(feature([1, 2])),
            brill.Template(feature([-3, -2, -1])),
            brill.Template(feature([1, 2, 3])),
        ]
        # Two-feature template: offset -1 combined with offset +1.
        family.append(brill.Template(feature([-1]), feature([1])))
        return family

    # Tag-based templates first, then word-based ones.
    # NOTE: the original author listed nltkdemo18(), nltkdemo18plus(),
    # fntbl37() and brill24() as alternative template sets; none is used here.
    templates = template_family(brill.Pos) + template_family(brill.Word)

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )
    return trainer.train(train_sents, **kwargs)
def train(self, data):
    """Fit an HMM baseline on ``data`` and train a Brill tagger on top of it.

    Side effects:
        ``self.baseline_tagger`` - the ``tagger`` attribute of the trained
        ``HMMTagger``.
        ``self.trainer`` - the ``BrillTaggerTrainer`` built around that baseline.
        ``self.tagger`` - the result of ``self.trainer.train(data)``.

    Args:
        data: training data, passed both to ``HMMTagger.train`` and to the
            Brill trainer.
    """
    # Baseline: an HMM tagger trained on the same data.
    hmm_model = HMMTagger()
    hmm_model.train(data)
    self.baseline_tagger = hmm_model.tagger

    # Brill templates: for each feature class, eight single-feature windows
    # followed by one template pairing offset -1 with offset +1.
    windows = [
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    ]
    rule_templates = []
    for feature_cls in (brill.Pos, brill.Word):
        rule_templates += [brill.Template(feature_cls(w)) for w in windows]
        rule_templates.append(
            brill.Template(feature_cls([-1]), feature_cls([1]))
        )

    self.trainer = BrillTaggerTrainer(self.baseline_tagger, templates=rule_templates)
    self.tagger = self.trainer.train(data)
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that corrects the output of ``initial_tagger``.

    The templates describe which context a learned rule may inspect when it
    changes the POS tag of a word:

    * the POS tag at one offset (previous, next, two back, two ahead);
    * the POS tag of any of the two / three previous (or following) words;
    * the POS tag of the previous word together with that of the next word;
    * the same nine patterns applied to the words themselves instead of
      their POS tags.

    Args:
        initial_tagger: the baseline tagger.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: options forwarded to ``trainer.train``. ``max_rules``
            defaults to 100 and ``min_score`` to 2 when not supplied.
            Previously these keyword arguments were accepted but ignored.

    Returns:
        Whatever ``BrillTaggerTrainer.train`` returns.
    """
    # Each Template generates candidate rules for the Brill tagger.
    templates = [
        brill.Template(brill.Pos([-1])),  # tag of the previous word
        brill.Template(brill.Pos([1])),  # tag of the next word
        brill.Template(brill.Pos([-2])),  # tag two positions back
        brill.Template(brill.Pos([2])),  # tag two positions ahead
        brill.Template(brill.Pos([-2, -1])),  # tag of any of the two previous words
        brill.Template(brill.Pos([1, 2])),  # tag of any of the two next words
        brill.Template(brill.Pos([-3, -2, -1])),  # any of the three previous words
        brill.Template(brill.Pos([1, 2, 3])),  # any of the three next words
        brill.Template(brill.Pos([-1]), brill.Pos([1])),  # previous AND next tag
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]

    # BrillTaggerTrainer parameters used here:
    #   initial_tagger - the baseline tagger
    #   templates      - list of templates to be used in training
    #   trace          - verbosity level (True behaves as level 1)
    #   deterministic  - if True, adjudicate ties deterministically
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )

    # Keep the historical defaults, but let callers override them instead of
    # silently discarding what they passed in.
    train_options = {"max_rules": 100, "min_score": 2}
    train_options.update(kwargs)
    return trainer.train(train_sents, **train_options)
def test_pos_template(self):
    """Check Brill tagging with a single ``Pos([-1])`` template.

    A ``UnigramTagger`` trained on the first 1000 treebank sentences is the
    baseline. The Brill tagger is trained on the same sentences. The words
    'foo' and 'sentence' are expected to be tagged ``None``.
    """
    corpus_slice = treebank.tagged_sents()[:1000]
    baseline = UnigramTagger(corpus_slice)
    only_template = [brill.Template(brill.Pos([-1]))]
    learner = brill_trainer.BrillTaggerTrainer(baseline, only_template)
    trained_tagger = learner.train(corpus_slice)

    # Example from https://github.com/nltk/nltk/issues/769
    tokens = 'This is a foo bar sentence'.split()
    result = trained_tagger.tag(tokens)
    expected_tags = ['DT', 'VBZ', 'DT', None, 'NN', None]
    expected = list(zip(tokens, expected_tags))
    self.assertEqual(result, expected)
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Build the 18-template Brill trainer and train it on ``train_sents``.

    Args:
        initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        Whatever ``BrillTaggerTrainer.train`` returns.
    """
    # Short local aliases keep the template table readable.
    make_template = brill.Template
    tag_at = brill.Pos
    word_at = brill.Word

    templates = [
        # Rules keyed on neighbouring part-of-speech tags.
        make_template(tag_at([-1])),  # e.g. the previous part-of-speech tag
        make_template(tag_at([1])),
        make_template(tag_at([-2])),
        make_template(tag_at([2])),
        make_template(tag_at([-2, -1])),
        make_template(tag_at([1, 2])),
        make_template(tag_at([-3, -2, -1])),
        make_template(tag_at([1, 2, 3])),
        make_template(tag_at([-1]), tag_at([1])),
        # Rules keyed on neighbouring words.
        make_template(word_at([-1])),
        make_template(word_at([1])),
        make_template(word_at([-2])),
        make_template(word_at([2])),
        make_template(word_at([-2, -1])),  # looks at the previous two words
        make_template(word_at([1, 2])),
        make_template(word_at([-3, -2, -1])),
        make_template(word_at([1, 2, 3])),
        make_template(word_at([-1]), word_at([1])),
    ]

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
def test_pos_template(self):
    """Brill tagger with one ``Pos([-1])`` template tags the sample sentence.

    Baseline and Brill training both use the first 1000 treebank sentences.
    The expected output keeps ``None`` for 'foo' and 'sentence'.
    """
    sample = treebank.tagged_sents()[:1000]
    unigram_baseline = UnigramTagger(sample)
    brill_learner = brill_trainer.BrillTaggerTrainer(
        unigram_baseline,
        [brill.Template(brill.Pos([-1]))],
    )
    learned_tagger = brill_learner.train(sample)

    # Example from https://github.com/nltk/nltk/issues/769
    sentence = "This is a foo bar sentence"
    actual = learned_tagger.tag(sentence.split())

    wanted = []
    for word, tag in (
        ("This", "DT"),
        ("is", "VBZ"),
        ("a", "DT"),
        ("foo", None),
        ("bar", "NN"),
        ("sentence", None),
    ):
        wanted.append((word, tag))

    self.assertEqual(actual, wanted)
(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*ness$', 'NN'), (r'.*ment$', 'NN'), (r'.*ful$', 'JJ'), (r'.*ious$', 'JJ'), (r'.*ble+$', 'JJ'), (r'.*ic$', 'JJ'), (r'.*ive$', 'JJ'), (r'.*ic$', 'JJ'), (r'.*est$', 'JJ'), (r'^a$', 'PREP'), (r'.*', 'NN')] regexp_tagger = nltk.RegexpTagger(patterns) affix_tagger = nltk.AffixTagger(tagged_posts, backoff=regexp_tagger) unigram_tagger = nltk.UnigramTagger(tagged_posts, backoff=affix_tagger) bigram_tagger = nltk.BigramTagger(tagged_posts, backoff=unigram_tagger) tbuar_tagger = nltk.TrigramTagger(tagged_posts, backoff=bigram_tagger) return tbuar_tagger templates = [ brill.Template(brill.Pos([-1])), brill.Template(brill.Pos([1])), brill.Template(brill.Pos([-2])), brill.Template(brill.Pos([2])), brill.Template(brill.Pos([-2, -1])), brill.Template(brill.Pos([1, 2])), brill.Template(brill.Pos([-3, -2, -1])), brill.Template(brill.Pos([1, 2, 3])), brill.Template(brill.Pos([-1]), brill.Pos([1])), brill.Template(brill.Word([-1])), brill.Template(brill.Word([1])), brill.Template(brill.Word([-2])), brill.Template(brill.Word([2])), brill.Template(brill.Word([-2, -1])), brill.Template(brill.Word([1, 2])), brill.Template(brill.Word([-3, -2, -1])),
print "Unigram accuracy: " print unigram_tagger.evaluate(evaulation_data) # Bigram tagger bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger) print "Bigram accuracy: " print bigram_tagger.evaluate(evaulation_data) # Trigram tagger trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger) print "Trigram accuracy: " print trigram_tagger.evaluate(evaulation_data) # Brill tagger templates templates = [ Template(brill.Pos([1, 1])), Template(brill.Pos([2, 2])), Template(brill.Pos([1, 2])), Template(brill.Pos([1, 3])), Template(brill.Word([1, 1])), Template(brill.Word([2, 2])), Template(brill.Word([1, 2])), Template(brill.Word([1, 3])), Template(brill.Pos([-1, -1]), brill.Pos([1, 1])), Template(brill.Word([-1, -1]), brill.Word([1, 1])), ] # First iteration trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates) brill_tagger = trainer.train(training_data, max_rules, min_score) print "Initial Brill accuracy:"
word = tokens[index] return nltk.pos_tag([word])[0][1] if word != "" else None custom_pos_tagger = POSTagger() # In[93]: import nltk import nltk.tag from nltk.tag import brill from nltk.tag import UnigramTagger from nltk.tag import BrillTaggerTrainer templates = [ brill.Template(brill.Pos([1, 1])), brill.Template(brill.Pos([2, 2])), brill.Template(brill.Pos([1, 2])), brill.Template(brill.Pos([1, 3])), brill.Template(brill.Pos([1, 1])), brill.Template(brill.Pos([2, 2])), brill.Template(brill.Pos([1, 2])), brill.Template(brill.Pos([1, 3])), brill.Template(brill.Word([-1, -1])), brill.Template(brill.Word([-1, -1])) ] trainer_initial_pos = BrillTaggerTrainer(initial_tagger=custom_pos_tagger, templates=templates, trace=3, deterministic=True)