def train(train_sentences):
    """Train a Brill tagger over a unigram/default backoff chain.

    Args:
        train_sentences: list of tagged sentences (lists of (word, tag) pairs).

    Returns:
        The trained ``BrillTagger``.
    """
    # NOTE(review): converted legacy Python 2 ``print`` statements to
    # ``print()`` calls for consistency with the rest of the file.
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')
    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)
    print("- Templates")
    # These templates define the features to be used for the Brill tagger,
    # relative to the word position.  Clear any templates registered by
    # earlier runs first (Template keeps a class-level registry).
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)
    print("- Done.")
    return tagger
def __init__(self, args, model_name, load_model=False):
    """Set up a Brill trainer stacked on a pre-trained HMM base tagger.

    Args:
        args: configuration object forwarded to the parent constructor.
        model_name: identifier for this tagger's saved model.
        load_model: when True, skip building a new trainer (the parent
            handles loading); when False, require a saved HMM base model.

    Raises:
        FileNotFoundError: if the HMM base tagger has no saved model.
    """
    super().__init__(args, model_name, load_model)
    if load_model:
        return
    base_tagger = HMM(args, "hmm", load_model=True)
    if not base_tagger.saved_model_exists():
        raise FileNotFoundError(f"Brill base tagger '{base_tagger.model_name}' missing!")
    # Context windows shared by the POS-based and the word-based templates:
    # single offsets up to 3 away, plus two- and three-token spans.
    windows = [[-1], [1], [-2], [2], [-2, -1], [1, 2], [-3, -2, -1], [1, 2, 3]]
    features = []
    for feature in (Pos, Word):
        features.extend(Template(feature(w)) for w in windows)
        # Surrounding-context template: the token just before and just after.
        features.append(Template(feature([-1]), feature([1])))
    self.model = nltk.BrillTaggerTrainer(base_tagger.model, features)
def meta_comparison(nb_iterations):
    """Evaluate Brill taggers trained with increasing rule budgets.

    Trains one tagger per step with ``max_rules = step * 50`` and records
    its evaluation score on the test set.

    Args:
        nb_iterations: upper bound (exclusive) for the step counter;
            steps run from 1 to ``nb_iterations - 1``.

    Returns:
        List of evaluation scores, one per step.
    """
    # Regexp backoff patterns taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
        (r'.*', 'NN'),                    # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    scores = []
    for step in range(1, nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
        trainer = BrillTaggerTrainer(init_tagger, templates, trace=3)
        trained = trainer.train(train_sentences, max_rules=step * 50)
        scores.append(trained.evaluate(test_sentences))
    return scores
def brill_rules_pos_wd_feats_offset_4():
    """
    Return 9 single-word feature templates with offsets up to 4,
    after the seminal TBL paper, Brill (1995).

    (The original docstring claimed 24 templates; this function has
    always returned 9 single-offset word templates.)
    """
    # Offsets in the original declaration order: -1..-4, then 0..4.
    return [Template(Word([offset])) for offset in (-1, -2, -3, -4, 0, 1, 2, 3, 4)]
def brill_rules_pos_bigram_feats_offset_4():
    """
    Return 8 word-bigram feature templates with offsets up to 4,
    after the seminal TBL paper, Brill (1995).

    (The original docstring claimed 24 templates; this function has
    always returned 8 two-offset word templates.)
    """
    # Adjacent-offset pairs sliding away from the focus position.
    pairs = [(-1, 0), (-2, -1), (-3, -2), (-4, -3), (1, 0), (2, 1), (3, 2), (4, 3)]
    return [Template(Word([a, b])) for a, b in pairs]
def create_tagger(train_sents):
    """Build a default→unigram→bigram→trigram backoff tagger, optionally
    topped with a Brill tagger when the module-level ``brill_value`` is True.

    Args:
        train_sents: tagged training sentences.

    Returns:
        The trained tagger (trigram backoff chain, or Brill on top of it).
    """
    # Stack n-gram taggers so each falls back to the next simpler one.
    backoff = nltk.DefaultTagger('S')
    for ngram_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
        backoff = ngram_cls(train_sents, backoff=backoff)
    if brill_value is True:
        Template._cleartemplates()
        # REDUIT: reduced template set — current word paired with a neighbour.
        templates = [
            Template(Word([0]), Word([-1])),
            Template(Word([0]), Word([1])),
        ]
        trainer = BrillTaggerTrainer(backoff, templates, trace=3)
        return trainer.train(train_sents, max_rules=20, min_score=0, min_acc=None)
    return backoff
def demo_generated_templates():
    """
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    memory, even on relatively small corpora.
    """
    wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
    tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)
    templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3)))
    print("Generated {0} templates for transformation-based learning".format(len(templates)))
    postag(templates=templates, incremental_stats=True, template_stats=True)
def Brill_recursion(nb_iterations):
    """Repeatedly retrain a Brill tagger on top of its previous iteration.

    Iteration 0 starts from a regexp baseline tagger; each subsequent
    iteration uses the previously trained Brill tagger as its base.

    Args:
        nb_iterations: number of training iterations to perform.

    Returns:
        List of test-set evaluation scores, one per iteration.
    """
    # Regexp backoff patterns taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
        (r'.*', 'NN'),                    # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    # The original if/else branches duplicated the train/evaluate/append
    # logic and differed only in the base tagger; a single loop variable
    # that starts at the baseline expresses the same behavior.
    current_tagger = init_tagger
    evaluations = []
    for _ in range(nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
        trainer = BrillTaggerTrainer(current_tagger, templates, trace=3)
        current_tagger = trainer.train(train_sentences)
        current_evaluation = current_tagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)
        print(current_evaluation)
    return evaluations
def demo_multifeature_template():
    """
    Templates can have more than a single feature.
    """
    # One template combining the current word with the two preceding POS tags.
    multi_feature = Template(Word([0]), Pos([-2, -1]))
    postag(templates=[multi_feature])
brown_sents = brown.sents()
# 70/30 split: the first 70% of the Brown corpus is used for training.
size = int(len(brown_sents) * 0.7)
training_data = brown.tagged_sents()[:size]

# POS- and word-feature templates over context windows up to three tokens
# away from the focus word, each followed by a surrounding-context template.
_windows = [[-1], [1], [-2], [2], [-2, -1], [1, 2], [-3, -2, -1], [1, 2, 3]]
templates = (
    [Template(Pos(w)) for w in _windows]
    + [Template(Pos([-1]), Pos([1]))]
    + [Template(Word(w)) for w in _windows]
    + [Template(Word([-1]), Word([1]))]
)

# Baseline tagger: a pre-trained CRF model loaded from disk.
baseline = CRFTagger()
baseline.set_model_file("model.crf.tagger")
# training brill tagger
(r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cadinal numbers (r'.*', 'NN'), # nouns (default) ] regexp_tagger = nltk.RegexpTagger(patterns) ## Part 2: Transformation-based learning and tagging # Define rule templates templates = [ ## original templates ## Template(Pos([-1])), # previous POS tag Template(Pos([-1]), Word([0])), # previous POS tag + current word ## my new templates ## Template(Pos([-2]), Pos([-1])), # previous two POS tags (conjunctive) (0%) Template(Pos([-2, -1])), # previous two POS tags (disjunctive) (<2%) Template(Word([0]), Word([-1])), # current word + previous word (0%) Template(Pos([-2]), Word([0])), # prev prev POS tag + current word (<1%) Template(Word([-1])), # previous word (<0.1%) Template(Pos([-1]), Word([-1])), # previous POS tag + previous word (0%) #Template(Word([0]), Word([1])) # current word + next word (<0%) Template(Pos([-1]), Pos([0])), # previous POS tag + current POS tag (0%)
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) baseline = backoff baseline.evaluate(gold_data) Template._cleartemplates() #clear any templates created in earlier tests templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] tt = BrillTaggerTrainer(baseline, templates, trace=3) tagger1 = tt.train(training_data, max_rules=10) tagger1.rules()[1:3] train_stats = tagger1.train_stats() tagger1.print_template_statistics(printunused=False) tagger1.evaluate(gold_data) tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) print(tagger2.evaluate(gold_data)) # doctest: +ELLIPSIS