def train(train_sentences):
    """Train a Brill tagger on top of a default -> unigram backoff chain.

    A ``DefaultTagger`` that labels every token ``'NC'`` backs off a
    ``UnigramTagger`` trained on ``train_sentences``; the unigram tagger is
    then used as the initial tagger of a ``BrillTaggerTrainer``.

    Args:
        train_sentences: Training sentences, passed unchanged to both
            ``UnigramTagger`` and ``BrillTaggerTrainer.train``.

    Returns:
        The tagger produced by ``BrillTaggerTrainer.train`` with
        ``max_rules=1000``.
    """
    # print(...) with a single string argument produces the same output on
    # Python 2 and Python 3, unlike the Python 2-only print statement.
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features used by the Brill tagger,
    # expressed relative to the position of the current word.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]

    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")
    return tagger
def meta_comparison(nb_iterations):
    """Evaluate Brill taggers trained with a growing rule budget.

    For every ``step`` in ``range(1, nb_iterations)`` a fresh Brill tagger is
    trained from the same regular-expression baseline with
    ``max_rules=step * 50`` and then evaluated.

    ``train_sentences`` and ``test_sentences`` are not parameters; they are
    read from the enclosing (module) scope.

    Args:
        nb_iterations: Exclusive upper bound of the step counter. Because the
            counter starts at 1, ``nb_iterations - 1`` taggers are trained
            (none when ``nb_iterations <= 1``).

    Returns:
        A list holding, in step order, the value returned by each tagger's
        ``evaluate(test_sentences)`` call.
    """
    # Suffix heuristics taken from http://www.nltk.org/book/ch05.html
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    baseline = RegexpTagger(suffix_rules)

    def score_for(step):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        # The template registry is reset before each run, as in every
        # iteration of the original loop.
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
        trainer = BrillTaggerTrainer(baseline, rule_templates, trace=3)
        trained = trainer.train(train_sentences, max_rules=step * 50)
        return trained.evaluate(test_sentences)

    return [score_for(step) for step in range(1, nb_iterations)]
def __init__(self, args, model_name, load_model=False):
    """Initialise the model and, unless loading, build a Brill trainer.

    The parent initialiser always runs. When ``load_model`` is falsy, an
    ``HMM`` model named ``"hmm"`` is created with ``load_model=True`` and its
    ``model`` attribute becomes the initial tagger of an
    ``nltk.BrillTaggerTrainer``, which is stored in ``self.model``.

    Args:
        args: Forwarded to the parent initialiser and to ``HMM``.
        model_name: Forwarded to the parent initialiser.
        load_model: Forwarded to the parent initialiser; when truthy no
            trainer is built here.

    Raises:
        FileNotFoundError: If ``saved_model_exists()`` on the HMM base tagger
            returns a falsy value.
    """
    super().__init__(args, model_name, load_model)
    if load_model:
        return

    hmm_base = HMM(args, "hmm", load_model=True)
    if not hmm_base.saved_model_exists():
        raise FileNotFoundError(f"Brill base tagger '{hmm_base.model_name}' missing!")

    # Relative positions looked at by the single-feature templates.
    context_windows = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    # Templates are created in the same order as the original literal list:
    # all Pos templates first, then the same set for Word.
    rule_templates = []
    for feature_cls in (Pos, Word):
        for window in context_windows:
            rule_templates.append(Template(feature_cls(list(window))))
        # Two-feature template: item on the left AND item on the right.
        rule_templates.append(Template(feature_cls([-1]), feature_cls([1])))

    self.model = nltk.BrillTaggerTrainer(hmm_base.model, rule_templates)
def Brill_recursion(nb_iterations):
    """Repeatedly train a Brill tagger on top of the previous Brill tagger.

    The first round starts from a regular-expression tagger; every later
    round uses the tagger trained in the previous round as its initial
    tagger. After each round the new tagger is evaluated.

    ``train_sentences`` and ``test_sentences`` are not parameters; they are
    read from the enclosing (module) scope.

    Args:
        nb_iterations: Number of training rounds (``range(nb_iterations)``).

    Returns:
        A list holding, in round order, the value returned by each tagger's
        ``evaluate(test_sentences)`` call.
    """
    # Suffix heuristics taken from http://www.nltk.org/book/ch05.html
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]

    # Round 0 starts from the regexp baseline; afterwards this name is
    # rebound to the most recently trained Brill tagger.
    base_tagger = RegexpTagger(suffix_rules)
    evaluations = []

    for _ in range(nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        trainer = BrillTaggerTrainer(base_tagger, rule_templates, trace=3)
        trained = trainer.train(train_sentences)
        score = trained.evaluate(test_sentences)
        evaluations.append(score)
        base_tagger = trained

        # NOTE(review): the original layout was collapsed onto one line; this
        # print is assumed to run once per round -- confirm against the
        # original file.
        print(score)

    return evaluations
def demo_multiposition_feature():
    """
    Demonstrate a feature that covers several positions at once.

    A feature takes a list of positions relative to the current word; the
    positions are conceptually joined by logical OR. For example,
    Pos([-1, 1]), given a value V, holds whenever V is found one step to the
    left and/or one step to the right.

    For contiguous ranges a 2-arg form with inclusive end points can also be
    used: Pos(-3, -1) is the same as the feature built below.
    """
    any_of_three_previous_tags = Pos([-3, -2, -1])
    single_template = Template(any_of_three_previous_tags)
    postag(templates=[single_template])
def demo_generated_templates():
    """
    Demonstrate bulk generation of templates.

    Template.expand and Feature.expand are class methods that facilitate
    generating large numbers of templates. See their documentation for
    details.

    Note: training with 500 templates can easily exhaust the available
    resources (presumably memory) even on relatively small corpora.
    """
    word_features = Word.expand([-1, 0, 1], [1, 2], excludezero=False)
    tag_features = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True)

    # Materialise the expansion so it can be both counted and passed on.
    generated = Template.expand([word_features, tag_features], combinations=(1, 3))
    templates = list(generated)

    template_count = len(templates)
    print("Generated {0} templates for transformation-based learning".format(template_count))

    postag(templates=templates, incremental_stats=True, template_stats=True)
def demo_multifeature_template():
    """
    Demonstrate a template built from more than one feature.

    The single template used here combines the current word with the tags
    at the two preceding positions.
    """
    current_word = Word([0])
    previous_two_tags = Pos([-2, -1])
    combined_template = Template(current_word, previous_two_tags)
    postag(templates=[combined_template])
import pickle import nltk.tag from nltk.corpus import brown from nltk.tag import CRFTagger from nltk.tbl.template import Template from nltk.tag.brill import Pos, Word from nltk.tag import BrillTaggerTrainer #preparing baseline CRFTagger and trainingData for brill tagger brown_sents = brown.sents() size = int(len(brown_sents) * 0.7) training_data = brown.tagged_sents()[:size] templates = [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-3, -2, -1])),
(r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cadinal numbers (r'.*', 'NN'), # nouns (default) ] regexp_tagger = nltk.RegexpTagger(patterns) ## Part 2: Transformation-based learning and tagging # Define rule templates templates = [ ## original templates ## Template(Pos([-1])), # previous POS tag Template(Pos([-1]), Word([0])), # previous POS tag + current word ## my new templates ## Template(Pos([-2]), Pos([-1])), # previous two POS tags (conjunctive) (0%) Template(Pos([-2, -1])), # previous two POS tags (disjunctive) (<2%) Template(Word([0]), Word([-1])), # current word + previous word (0%) Template(Pos([-2]), Word([0])), # prev prev POS tag + current word (<1%) Template(Word([-1])), # previous word (<0.1%) Template(Pos([-1]), Word([-1])), # previous POS tag + previous word (0%) #Template(Word([0]), Word([1])) # current word + next word (<0%) Template(Pos([-1]),
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) baseline = backoff baseline.evaluate(gold_data) Template._cleartemplates() #clear any templates created in earlier tests templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] tt = BrillTaggerTrainer(baseline, templates, trace=3) tagger1 = tt.train(training_data, max_rules=10) tagger1.rules()[1:3] train_stats = tagger1.train_stats() tagger1.print_template_statistics(printunused=False) tagger1.evaluate(gold_data) tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) print(tagger2.evaluate(gold_data)) # doctest: +ELLIPSIS