def trainRegexp(self, backoff):
    """Cross-validate the regular-expression tagger over all folds.

    The regexp tagger matching ``self.option_tone`` / ``self.option_tag`` is
    built on top of ``backoff``. For each of the ``self.folds`` rounds the
    last fold in ``self.foldlist`` is untagged, re-tagged with the regexp
    tagger, and collected; the fold list is then rotated so that a different
    fold is last in the next round.

    Results are stored on the instance:

    * ``self.regex`` -- the tagger that was used,
    * ``self.regex_tagged`` -- the tagger output (appended to, not reset),
    * ``self.regex_avg_acc`` -- ``accuracy()`` over all concatenated tokens,
    * ``self.regex_tagprecision`` / ``self.regex_tagrecall`` -- the pair
      returned by ``self.tagprecision_recall``.

    Afterwards ``self.org_tagged`` is emptied and ``self.foldlist`` is
    rebuilt with ``self.create_fold``.

    :param backoff: tagger handed to the regexp tagger as its backoff.
    :raises ValueError: if the tone/tag option combination has no regexp
        tagger (previously this surfaced as an ``UnboundLocalError``).
    """
    from itertools import chain

    regex_classes = {
        ("tonal", "Affixes"): RegexpTonalSA,
        ("tonal", "POS"): RegexpTonal,
        ("nontonal", "Affixes"): RegexpSA,
        ("nontonal", "POS"): Regexp,
    }

    self.split_into_folds()

    options = (self.option_tone, self.option_tag)
    if options not in regex_classes:
        raise ValueError(
            "No regexp tagger for option_tone=%r, option_tag=%r"
            % options
        )
    # The tagger depends only on the options and the backoff, none of which
    # change between folds, so it is built once instead of once per fold.
    regex = regex_classes[options](backoff)

    last = self.folds - 1
    for _ in range(self.folds):
        held_out = self.foldlist[last]
        self.regex_tagged += regex.tag_sents([untag(sent) for sent in held_out])
        self.org_tagged += held_out
        # Rotate: the fold just evaluated moves to the front, so the next
        # round picks up a different fold at index ``last``.
        self.foldlist = [held_out] + self.foldlist[:last]

    self.regex = regex
    # Flatten sentence lists into token lists in linear time.
    gold_tokens = list(chain.from_iterable(self.org_tagged))
    tagged_tokens = list(chain.from_iterable(self.regex_tagged))
    self.regex_avg_acc = accuracy(gold_tokens, tagged_tokens)
    print("Accuracy of concatenated regexp-tagged sentences: ", self.regex_avg_acc)
    (self.regex_tagprecision, self.regex_tagrecall) = self.tagprecision_recall(
        regex, self.regex_tagged, self.org_tagged
    )

    # NOTE(review): regex_tagged is not cleared here while org_tagged is, so
    # a second call would accumulate tagger output -- confirm this is intended.
    self.org_tagged = []
    self.foldlist = []
    for fold_number in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(fold_number))
def indivRegexp(bambara, option_tag, option_tones, backoff):
    """Build the regexp tagger for the given options and print its accuracy.

    :param bambara: corpus object; its ``test_sents`` attribute is passed to
        the tagger's ``evaluate`` method.
    :param option_tag: ``"POS"`` or ``"Affixes"``.
    :param option_tones: ``"tonal"`` or ``"nontonal"``.
    :param backoff: tagger handed to the regexp tagger as its backoff.
    :return: the constructed regexp tagger.
    :raises ValueError: if the option combination has no regexp tagger
        (previously this surfaced as an ``UnboundLocalError``).
    """
    regex_classes = {
        ("tonal", "Affixes"): RegexpTonalSA,
        ("tonal", "POS"): RegexpTonal,
        ("nontonal", "Affixes"): RegexpSA,
        ("nontonal", "POS"): Regexp,
    }
    options = (option_tones, option_tag)
    if options not in regex_classes:
        raise ValueError(
            "No regexp tagger for option_tones=%r, option_tag=%r" % options
        )

    regex = regex_classes[options](backoff=backoff)
    print("Regexp accuracy: ",regex.evaluate(bambara.test_sents))
    return regex
def backoff_tagger(num_tagger, bambara, option_tones="tonal", option_tag="POS", backoff=defaultTagger):
    """Build a chain of backoff taggers (adapted from the NLTK cookbook).

    ``num_tagger`` is an iterable of integer codes selecting the taggers:

    ==== ==================
    code tagger
    ==== ==================
    0    UnigramTagger
    1    BigramTagger
    2    TrigramTagger
    3    QuadgramTagger
    4    CRF
    5    regexp
    6    dictionary
    8    AffixTagger
    9    TnT
    10   HMM
    ==== ==================

    Any other code is ignored. Taggers are stacked in the order given: each
    one receives the chain built so far as its backoff, so the last code in
    ``num_tagger`` produces the outermost tagger, which is returned.

    HMM and CRF do not accept a backoff, so they can only sit at the bottom
    of the chain: when requested they replace the ``backoff`` argument
    regardless of their position in ``num_tagger``. If both are requested the
    CRF tagger wins, because it is assigned after the HMM.

    :param num_tagger: iterable of tagger codes (see table).
    :param bambara: corpus object providing ``train_sents``.
    :param option_tones: ``"tonal"`` or ``"nontonal"``.
    :param option_tag: ``"POS"`` or ``"Affixes"``.
    :param backoff: tagger at the bottom of the chain.
    :return: the outermost tagger of the chain.
    """
    ngram_classes = {
        0: UnigramTagger,
        1: BigramTagger,
        2: TrigramTagger,
        3: QuadgramTagger,
    }
    special_names = {
        4: "crf",
        5: "regexp",
        6: "dic",
        8: "affix",
        9: "tnt",
        10: "hmm",
    }
    regex_classes = {
        ("tonal", "Affixes"): RegexpTonalSA,
        ("tonal", "POS"): RegexpTonal,
        ("nontonal", "Affixes"): RegexpSA,
        ("nontonal", "POS"): Regexp,
    }

    # N-gram taggers are stored as classes; the others as string markers
    # because they need a non-uniform construction further down.
    taggers = []
    for code in num_tagger:
        if code in ngram_classes:
            taggers.append(ngram_classes[code])
        elif code in special_names:
            taggers.append(special_names[code])

    # CRF and HMM both do not accept backoff and therefore can only be the
    # last tagger in a backoff chain -> DefaultTagger has to be substituted
    if "hmm" in taggers:
        tag_set = set()
        symbols = set()
        for sent in bambara.train_sents:
            for token in sent:
                tag_set.add(token[1])
                symbols.add(token[0])
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        backoff = trainer.train_supervised(
            bambara.train_sents,
            estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
        )
        # Drop every marker, not just the first: a leftover string would be
        # called like a tagger class in the loop below.
        taggers = [entry for entry in taggers if entry != "hmm"]

    if "crf" in taggers:
        # The original call passed the undefined names ``tone`` and ``tag``;
        # the function's own option parameters are what is meant.
        # NOTE(review): assumes indivCRF takes (corpus, tone, tag) -- confirm.
        backoff = indivCRF(bambara, option_tones, option_tag)
        model_file = "model.crfbackoff"+option_tag+option_tones+".tagger"
        backoff.train(bambara.train_sents, model_file)
        backoff.set_model_file(model_file)
        taggers = [entry for entry in taggers if entry != "crf"]

    for cls in taggers:
        if cls == "dic":
            backoff = dictionary_backoff(option_tones, backoff=backoff)
        elif cls == "regexp":
            # An unsupported option combination leaves the chain unchanged,
            # exactly as before.
            regex_cls = regex_classes.get((option_tones, option_tag))
            if regex_cls is not None:
                backoff = regex_cls(backoff=backoff)
        elif cls == "affix":
            backoff = AffixTagger(
                bambara.train_sents,
                min_stem_length=0,
                affix_length=-4,
                backoff=backoff,
            )
        elif cls == "tnt":
            backoff = tnt.TnT(unk=backoff, Trained=True, N=100)
            backoff.train(bambara.train_sents)
        else:
            # N-gram tagger class: train it with the current chain as backoff.
            backoff = cls(bambara.train_sents, backoff=backoff)
    return backoff