def template_comparison(nb_iterations):
    """Train one Brill tagger per stock NLTK template set and score each one.

    Every tagger starts from the same ``RegexpTagger`` baseline, is trained on
    the module-level ``train_sentences`` and is scored against the
    module-level ``test_sentences``.

    :param nb_iterations: not used by this function; kept so existing callers
        keep working.
    :return: list with one ``evaluate`` score per template set, in the order
        nltkdemo18, nltkdemo18plus, fntbl37, brill24.
    """
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)

    # Keep the factories, not their results: each template set has to be
    # built *after* the registry reset below. Building all sets up front and
    # then clearing the registry would drop the templates that are about to
    # be trained on and let template ids run on from one set to the next.
    template_factories = [
        nltk.tag.brill.nltkdemo18,
        nltk.tag.brill.nltkdemo18plus,
        nltk.tag.brill.fntbl37,
        nltk.tag.brill.brill24,
    ]

    evaluations = []
    for make_templates in template_factories:
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        trainer = BrillTaggerTrainer(init_tagger, make_templates(), trace=3)
        tagger = trainer.train(train_sentences)
        evaluations.append(tagger.evaluate(test_sentences))
    return evaluations
def meta_comparison(nb_iterations):
    """Score Brill taggers trained with a growing rule budget.

    For every ``step`` in ``range(1, nb_iterations)`` a fresh tagger is
    trained on the module-level ``train_sentences`` with
    ``max_rules=step * 50`` and scored against the module-level
    ``test_sentences``.  Note that this yields ``nb_iterations - 1`` runs.

    :param nb_iterations: exclusive upper bound of the step counter.
    :return: list of ``evaluate`` scores, one per step, in increasing
        rule-budget order.
    """
    # Taken from http://www.nltk.org/book/ch05.html
    regexp_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    baseline = RegexpTagger(regexp_rules)

    scores = []
    for step in range(1, nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        # Reset the template registry before creating this round's templates.
        Template._cleartemplates()
        rule_templates = [
            Template(Pos([-1])),
            Template(Pos([-1]), Word([0])),
        ]
        trainer = BrillTaggerTrainer(baseline, rule_templates, trace=3)
        brill_tagger = trainer.train(train_sentences, max_rules=step * 50)
        scores.append(brill_tagger.evaluate(test_sentences))
    return scores
def train(train_sentences):
    """Train a Brill tagger on top of a unigram tagger and return it.

    The initial tagger is a ``UnigramTagger`` trained on ``train_sentences``
    that backs off to a ``DefaultTagger`` assigning ``'NC'``.  Progress
    messages are printed to stdout between the stages.

    :param train_sentences: tagged sentences used both for the unigram
        tagger and for Brill rule learning.
    :return: the tagger produced by ``BrillTaggerTrainer.train`` with at most
        1000 rules.
    """
    # Single-argument print(...) calls emit the same text under Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features to be used for the brill tagger
    # relatively to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]

    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")
    return tagger
def Brill_recursion(nb_iterations):
    """Stack Brill taggers: each round uses the previous round's tagger as base.

    Round 0 starts from a ``RegexpTagger``; every later round trains a new
    Brill tagger whose initial tagger is the tagger produced by the round
    before.  All rounds train on the module-level ``train_sentences`` and are
    scored against the module-level ``test_sentences``.  Each round's score
    is printed.

    :param nb_iterations: number of training rounds.
    :return: list of ``evaluate`` scores, one per round.
    """
    # Taken from http://www.nltk.org/book/ch05.html
    regexp_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    # A CRFTagger(feature_func=feature_func) trained into 'model.crf.tagger'
    # was tried as the starting point at some stage; the regexp tagger is the
    # one actually used.
    base_tagger = RegexpTagger(regexp_rules)

    scores = []
    for _ in range(nb_iterations):
        # Open question from the original author: BrillTagger or
        # BrillTaggerTrainer?  See
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        rule_templates = [
            Template(Pos([-1])),
            Template(Pos([-1]), Word([0])),
        ]
        trainer = BrillTaggerTrainer(base_tagger, rule_templates, trace=3)
        # The freshly trained tagger becomes the base of the next round.
        base_tagger = trainer.train(train_sentences)
        score = base_tagger.evaluate(test_sentences)
        scores.append(score)
        print(score)
    return scores
def create_tagger(train_sents):
    """Build a default -> unigram -> bigram -> trigram backoff tagger.

    When the module-level flag ``brill_value`` is exactly ``True``, a Brill
    tagger is additionally trained on top of the trigram tagger and returned
    instead.

    :param train_sents: tagged sentences used to train every stage.
    :return: the trigram tagger, or the Brill tagger trained on top of it.
    """
    fallback = nltk.DefaultTagger('S')
    unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
    bigram = nltk.BigramTagger(train_sents, backoff=unigram)
    trigram = nltk.TrigramTagger(train_sents, backoff=bigram)

    # Identity test on purpose: only the literal True enables Brill training.
    if brill_value is not True:
        return trigram

    Template._cleartemplates()
    # Reduced template set: current word paired with one neighbouring word.
    word_pair_templates = [
        Template(Word([0]), Word([-1])),
        Template(Word([0]), Word([1])),
    ]
    trainer = BrillTaggerTrainer(trigram, word_pair_templates, trace=3)
    return trainer.train(train_sents, max_rules=20, min_score=0, min_acc=None)
def __init__(self, tagged_sents, anonProperNouns=False, initialTagger=None, max_rules=250, min_score=2,
             min_acc=None, template='fntbl37'):
    '''
    Construct a new MTEBrillTagger and train it with the sentences from
    tagged_sents.

    :param tagged_sents: Tagged sentences to train the tagger.
    :type tagged_sents: [[(word:str, tag:str)]]
    :param anonProperNouns: Set 'True' to replace every token whose tag
        starts with "#Np" by the pair ("anon", "#Np"). Currently only for
        MTE tags.
    :type anonProperNouns: bool
    :param initialTagger: If None or unset, a UnigramTagger trained on the
        (possibly anonymised) sentences is used as initial tagger;
        otherwise the given tagger is used as is.
    :type initialTagger: Tagger
    :param max_rules: tagger generates at most max_rules rules
    :type max_rules: int
    :param min_score: stop training when no rules better than min_score
        can be found
    :type min_score: int
    :param min_acc: discard any rule with lower accuracy than min_acc
    :type min_acc: float or None
    :param template: template set to use to train the Brill tagger. Either
        a list of templates, or one of the names "fntbl37", "brill24",
        "nltkdemo18", "nltkdemo18plus", or "baseline" (no Brill training:
        the initial tagger is kept). None selects fntbl37.
    :type template: str or list or None
    :raises ValueError: if template is an unknown name or has an
        unsupported type.
    '''
    ANON = "anon"
    if anonProperNouns:
        # Build anonymised copies; the caller's sentences are left untouched.
        self._tagged_sents = [
            [(ANON, "#Np") if tag.startswith("#Np") else (w, tag) for (w, tag) in s]
            for s in tagged_sents
        ]
    else:
        self._tagged_sents = tagged_sents

    # Reset the template registry before any template set is instantiated.
    Template._cleartemplates()

    # If 'template' parameter is 'None' default to fntbl37 template set
    if template is None:
        templates = fntbl37()
    # A list is used directly. isinstance (rather than an exact type
    # comparison) also accepts list subclasses.
    elif isinstance(template, list):
        templates = template
    # A string names one of the template-set functions from nltk
    elif isinstance(template, str):
        if template == "fntbl37":
            templates = fntbl37()
        elif template == "brill24":
            templates = brill24()
        elif template == "nltkdemo18":
            templates = nltkdemo18()
        elif template == "nltkdemo18plus":
            templates = nltkdemo18plus()
        elif template == "baseline":
            # No Brill stage at all: keep the initial tagger.
            templates = None
        else:
            raise ValueError("Method returning templates not found!")
    # If it is any other type, raise error
    else:
        raise ValueError(
            "Please specify the name of a function that returns a list of templates or a list of templates directly!")

    if initialTagger is None:
        self._tagger = UnigramTagger(self._tagged_sents)
    else:
        self._tagger = initialTagger

    if templates is not None:
        trainer = BrillTaggerTrainer(self._tagger, templates, trace=3)
        self._tagger = trainer.train(self._tagged_sents, max_rules=max_rules, min_score=min_score,
                                     min_acc=min_acc)
import pickle import nltk.tag from nltk.corpus import brown from nltk.tag import CRFTagger from nltk.tbl.template import Template from nltk.tag.brill import Pos, Word from nltk.tag import BrillTaggerTrainer #preparing baseline CRFTagger and trainingData for brill tagger brown_sents = brown.sents() size = int(len(brown_sents) * 0.7) training_data = brown.tagged_sents()[:size] templates = [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-3, -2, -1])),
# Regular-expression baseline tagger used as the Brill trainer's initial
# tagger. `gold_data`, `training_data` and `testing_data` are defined outside
# this fragment.
backoff = RegexpTagger([
    # NOTE(review): the '.' in the next pattern is not escaped, so it matches
    # any single character, not only a decimal point -- confirm this is
    # intended before tightening it.
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*able$', 'JJ'),                # adjectives
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # past tense verbs
    (r'.*', 'NN')                      # nouns (default)
])
# `baseline` is a second name for the same tagger object, not a copy.
baseline = backoff
# Result is discarded here (this and the bare expressions below look like
# leftovers from an interactive/doctest session).
baseline.evaluate(gold_data)
Template._cleartemplates()  # clear any templates created in earlier tests
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
tt = BrillTaggerTrainer(baseline, templates, trace=3)
# First tagger: at most 10 rules, default thresholds.
tagger1 = tt.train(training_data, max_rules=10)
tagger1.rules()[1:3]  # result discarded
train_stats = tagger1.train_stats()
tagger1.print_template_statistics(printunused=False)
tagger1.evaluate(gold_data)  # result discarded
tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
# Second tagger from the same trainer, additionally passing min_acc=0.99.
tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
# 10-fold style loop over the shuffled blocks: block number `iter` is held
# out as test data and all other blocks are concatenated into the training
# data. `bloques` must therefore contain at least 10 blocks.
random.shuffle(bloques)
for iter in range(10):
    test = bloques[iter]

    # Exclude the held-out block by position rather than by value: comparing
    # with `!=` would also drop any other block whose contents happen to
    # equal the test block, and it deep-compares whole blocks on every pass.
    train = []
    for position, element in enumerate(bloques):
        if position != iter:
            train.extend(element)

    # Tagger training
    # Brill tagger
    baseline_data = train
    baseline = UnigramTagger(baseline_data)
    #baseline = hmm.HiddenMarkovModelTagger.train(baseline_data)
    templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
    tagger_brill_tr = brill_trainer.BrillTaggerTrainer(initial_tagger=baseline, templates=templates, trace=3)
    tagger_brill = tagger_brill_tr.train(train, max_rules=10)

    # The two string blocks below are disabled alternatives (CRF and
    # perceptron taggers) kept by the original author.
    '''
    # CRF tagger
    tagger_crf = crf.CRFTagger()
    tagger_crf.train(train, "model")
    print("CRF Fold", iter)
    '''
    '''
    # Perceptron tagger
    tagger_perceptron = perceptron.PerceptronTagger(load = False)
    tagger_perceptron.train(train)
    '''
def __init__(self, args, model_name, load_model=False):
    """Initialise the model and, unless loading, set up a Brill trainer.

    The parent initialiser is always called.  When ``load_model`` is falsy,
    an ``HMM`` instance is created with ``load_model=True`` and its
    ``model`` becomes the initial tagger of a ``BrillTaggerTrainer`` that is
    stored in ``self.model``.

    :param args: forwarded to the parent initialiser and to ``HMM``.
    :param model_name: forwarded to the parent initialiser.
    :param load_model: forwarded to the parent initialiser; when truthy no
        trainer is built here.
    :raises FileNotFoundError: if the HMM base tagger reports that no saved
        model exists.
    """
    super().__init__(args, model_name, load_model)
    if load_model:
        return

    hmm_tagger = HMM(args, "hmm", load_model=True)
    if not hmm_tagger.saved_model_exists():
        raise FileNotFoundError(f"Brill base tagger '{hmm_tagger.model_name}' missing!")

    # Context windows shared by the tag-based and the word-based templates.
    windows = (
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    )
    rule_templates = []
    # Tag (Pos) templates first, then the same shapes over words, each group
    # closed by the "one token on either side" template.
    for feature in (Pos, Word):
        for window in windows:
            rule_templates.append(Template(feature(window)))
        rule_templates.append(Template(feature([-1]), feature([1])))

    self.model = nltk.BrillTaggerTrainer(hmm_tagger.model, rule_templates)