def indivCRF(bambara, tone, tag):
    """Train one CRF tagger on ``bambara.train_sents`` and print its score.

    ``tone`` and ``tag`` are concatenated into the model file name handed to
    ``CRFTagger.train`` (``Models/model.indivCRF<tone><tag>.tagger``).  The
    printed score is ``crf.evaluate`` on ``bambara.test_sents``.

    Returns the trained ``CRFTagger`` instance.
    """
    # Best training options found for the CRF; c1 and c2 follow the suggestion at
    # http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    training_options = {
        'max_iterations': 100,
        'max_linesearch': 10,
        'c1': 0.0001,
        'c2': 1.0,
    }
    tagger = CRFTagger(training_opt=training_options)

    print("Training CRF Tagger...")
    training_data = bambara.train_sents
    model_path = "".join(("Models/model.indivCRF", tone, tag, ".tagger"))
    tagger.train(training_data, model_path)

    score = tagger.evaluate(bambara.test_sents)
    print("CRF accuracy: ", score)
    return tagger
def trainALL(self, last):
    """Cross-validate a CRF, a TnT, an HMM and one n-gram tagger over all folds.

    For each of ``self.folds`` rounds the taggers are trained on every entry
    of ``self.foldlist`` except the last one, the last entry is tagged by all
    four taggers, and ``self.foldlist`` is rotated so that a different entry
    is held out in the next round.

    Args:
        last: Selects the fourth tagger. ``"U"`` builds a ``UnigramTagger``
            backed by ``DefaultTagger("n")``; ``"B"`` builds a
            ``BigramTagger`` backed by an affix tagger, the dictionary
            backoff and a regexp tagger chosen from ``self.option_tone`` and
            ``self.option_tag``.

    Raises:
        ValueError: If ``last`` is neither ``"U"`` nor ``"B"``, or if
            ``last == "B"`` and the tone/tag option pair has no regexp
            tagger.

    Side effects:
        * Appends predictions to ``self.crf_tagged``, ``self.tnt_tagged``,
          ``self.hmm_tagged`` and ``self.lasttagger_tagged``.
        * Stores the taggers of the final round in ``self.crf``,
          ``self.tnt``, ``self.hmm`` and ``self.lasttagger``.
        * Sets the ``*_avg_acc``, ``*_tagprecision`` and ``*_tagrecall``
          attributes for all four taggers.
        * Clears ``self.org_tagged`` and rebuilds ``self.foldlist`` through
          ``self.create_fold``.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import chain

    # Fail fast: an unsupported selector would otherwise only surface after
    # the CRF, TnT and HMM models of the first fold have been trained.
    if last not in ("U", "B"):
        raise ValueError(
            "last must be 'U' (unigram) or 'B' (bigram), got %r" % (last,)
        )

    self.split_into_folds()
    held_out = self.folds - 1  # index of the fold used for tagging

    for k in range(1, self.folds + 1):
        # Flatten the training folds in linear time.
        train_sents = list(chain.from_iterable(self.foldlist[:held_out]))

        crf = CRFTagger(
            training_opt={
                "max_iterations": 100,
                "max_linesearch": 10,
                "c1": 0.0001,
                "c2": 1.0,
            }
        )
        crf.train(
            train_sents,
            "Models/model.crfCrossValidation1"
            + str(k)
            + self.option_tone
            + self.option_tag
            + ".tagger",
        )
        print(str(k) + " fold: crf")

        tnt_tagger = tnt.TnT(unk=DefaultTagger("n"), Trained=True, N=100)
        tnt_tagger.train(train_sents)
        print(str(k) + " fold: tnt")

        # The HMM trainer is given the full state (tag) and symbol (word)
        # inventories seen in the training data.
        tag_set = {token[1] for sent in train_sents for token in sent}
        symbols = {token[0] for sent in train_sents for token in sent}
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        hmm = trainer.train_supervised(
            train_sents,
            estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
        )
        print(str(k) + " fold: hmm")

        if last == "U":
            lasttagger = UnigramTagger(train_sents, backoff=DefaultTagger("n"))
            print(str(k) + " fold: unigram")
        else:  # last == "B", guaranteed by the check at the top
            tone = self.option_tone
            tag_kind = self.option_tag
            if tone == "tonal" and tag_kind == "Affixes":
                regex = RegexpTonalSA(DefaultTagger("n"))
            elif tone == "tonal" and tag_kind == "POS":
                regex = RegexpTonal(DefaultTagger("n"))
            elif tone == "nontonal" and tag_kind == "Affixes":
                regex = RegexpSA(DefaultTagger("n"))
            elif tone == "nontonal" and tag_kind == "POS":
                regex = Regexp(DefaultTagger("n"))
            else:
                raise ValueError(
                    "no regexp tagger for option_tone=%r, option_tag=%r"
                    % (tone, tag_kind)
                )
            dic = dictionary_backoff(self.option_tone, regex)
            affix = AffixTagger(
                train_sents, min_stem_length=0, affix_length=-4, backoff=dic
            )
            lasttagger = BigramTagger(train_sents, backoff=affix)
            print(str(k) + " fold: bigram")

        test_fold = self.foldlist[held_out]
        to_tag = [untag(sent) for sent in test_fold]
        self.crf_tagged += crf.tag_sents(to_tag)
        self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
        self.hmm_tagged += hmm.tag_sents(to_tag)
        self.lasttagger_tagged += lasttagger.tag_sents(to_tag)
        self.org_tagged += test_fold

        # Rotate: the fold just tagged moves to the front, so the next round
        # holds out a different fold.
        self.foldlist = [test_fold] + self.foldlist[:held_out]

    # Keep the taggers trained in the final round.
    self.crf = crf
    self.tnt = tnt_tagger
    self.hmm = hmm
    self.lasttagger = lasttagger

    # Token-level accuracy over the predictions of all folds concatenated.
    org_words = list(chain.from_iterable(self.org_tagged))
    self.crf_avg_acc = accuracy(
        org_words, list(chain.from_iterable(self.crf_tagged))
    )
    self.tnt_avg_acc = accuracy(
        org_words, list(chain.from_iterable(self.tnt_tagged))
    )
    self.hmm_avg_acc = accuracy(
        org_words, list(chain.from_iterable(self.hmm_tagged))
    )
    self.lasttagger_avg_acc = accuracy(
        org_words, list(chain.from_iterable(self.lasttagger_tagged))
    )
    print("Accuracy of concatenated crf-tagged sentences: ", self.crf_avg_acc)
    print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
    print("Accuracy of concatenated hmm-tagged sentences: ", self.hmm_avg_acc)
    print(
        "Accuracy of concatenated " + last + "-tagged sentences: ",
        self.lasttagger_avg_acc,
    )

    (self.crf_tagprecision, self.crf_tagrecall) = self.tagprecision_recall(
        crf, self.crf_tagged, self.org_tagged
    )
    (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
        tnt_tagger, self.tnt_tagged, self.org_tagged
    )
    (self.hmm_tagprecision, self.hmm_tagrecall) = self.tagprecision_recall(
        hmm, self.hmm_tagged, self.org_tagged
    )
    (self.lasttagger_tagprecision, self.lasttagger_tagrecall) = (
        self.tagprecision_recall(
            lasttagger, self.lasttagger_tagged, self.org_tagged
        )
    )

    # NOTE(review): only org_tagged is cleared here; the *_tagged lists keep
    # their contents, so a second call would accumulate on top of them.
    # Confirm whether callers rely on that.
    self.org_tagged = []

    # Rebuild the fold list one entry at a time; create_fold may observe the
    # partially built list, so the append loop is kept as is.
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))