Example #1
0
 def trainRegexp(self, backoff):
     """Evaluate the regexp tagger fold by fold and store the results on self.

     The tagger class is chosen from ``self.option_tone`` ("tonal" /
     "nontonal") and ``self.option_tag`` ("Affixes" / "POS").  Each fold in
     turn is taken from the end of ``self.foldlist``, untagged, re-tagged
     with the regexp tagger, and the fold list is rotated so the next fold
     becomes the held-out one.

     Results written to the instance:
         regex                 -- the tagger instance that was evaluated
         regex_tagged          -- extended with the tagger output per fold
         regex_avg_acc         -- accuracy over all concatenated tokens
         regex_tagprecision,
         regex_tagrecall       -- as returned by ``self.tagprecision_recall``

     ``self.org_tagged`` is emptied and ``self.foldlist`` is rebuilt with
     ``self.create_fold`` before returning.

     Args:
         backoff: tagger handed to the regexp tagger constructor.

     Raises:
         ValueError: if the tone/tag option combination has no regexp tagger.
     """
     # Resolve the tagger class up front: an unsupported combination used to
     # surface only later as an UnboundLocalError on ``regex``.
     tone, tag = self.option_tone, self.option_tag
     if tone == "tonal" and tag == "Affixes":
         tagger_cls = RegexpTonalSA
     elif tone == "tonal" and tag == "POS":
         tagger_cls = RegexpTonal
     elif tone == "nontonal" and tag == "Affixes":
         tagger_cls = RegexpSA
     elif tone == "nontonal" and tag == "POS":
         tagger_cls = Regexp
     else:
         raise ValueError(
             "unsupported option combination: option_tone=%r, option_tag=%r"
             % (tone, tag)
         )

     self.split_into_folds()

     # The tagger is built the same way for every fold (nothing here trains
     # it), so it is constructed once instead of once per fold.
     regex = tagger_cls(backoff)

     # NOTE(review): self.regex_tagged is extended here but, unlike
     # self.org_tagged, never reset in this method -- confirm that repeated
     # calls are not expected to start from an empty list.
     for _ in range(self.folds):
         held_out = self.foldlist[self.folds - 1]
         self.regex_tagged += regex.tag_sents([untag(sent) for sent in held_out])
         self.org_tagged += held_out
         # Rotate: move the fold just evaluated to the front.
         self.foldlist = [held_out] + self.foldlist[: (self.folds - 1)]

     self.regex = regex

     # Flatten sentence lists into token lists in linear time.
     gold_tokens = [token for sent in self.org_tagged for token in sent]
     predicted_tokens = [token for sent in self.regex_tagged for token in sent]
     self.regex_avg_acc = accuracy(gold_tokens, predicted_tokens)
     print("Accuracy of concatenated regexp-tagged sentences: ", self.regex_avg_acc)

     (self.regex_tagprecision, self.regex_tagrecall) = self.tagprecision_recall(
         regex, self.regex_tagged, self.org_tagged
     )

     # Leave the instance ready for the next evaluation run.
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
Example #2
0
def indivRegexp(bambara, option_tag, option_tones, backoff):
    """Build the regexp tagger for the given options, report its accuracy, return it.

    Args:
        bambara: corpus object; its ``test_sents`` attribute is passed to the
            tagger's ``evaluate`` method.
        option_tag: "Affixes" or "POS".
        option_tones: "tonal" or "nontonal".
        backoff: tagger passed to the regexp tagger as ``backoff``.

    Returns:
        The constructed regexp tagger.

    Raises:
        ValueError: if the tone/tag combination has no regexp tagger
            (previously this surfaced as an UnboundLocalError on ``regex``).
    """
    if option_tones == "tonal" and option_tag == "Affixes":
        tagger_cls = RegexpTonalSA
    elif option_tones == "tonal" and option_tag == "POS":
        tagger_cls = RegexpTonal
    elif option_tones == "nontonal" and option_tag == "Affixes":
        tagger_cls = RegexpSA
    elif option_tones == "nontonal" and option_tag == "POS":
        tagger_cls = Regexp
    else:
        raise ValueError(
            "unsupported option combination: option_tones=%r, option_tag=%r"
            % (option_tones, option_tag)
        )

    regex = tagger_cls(backoff=backoff)
    print("Regexp accuracy: ", regex.evaluate(bambara.test_sents))
    return regex
Example #3
0
def backoff_tagger(num_tagger, bambara, option_tones="tonal", option_tag="POS", backoff=defaultTagger):
    """Build a chain of backoff taggers (adapted from the NLTK cookbook).

    Args:
        num_tagger: iterable of tagger ids, in chain order.  Each tagger wraps
            the chain built so far, so the last id listed ends up outermost.
            Recognised ids:
                0 UnigramTagger, 1 BigramTagger, 2 TrigramTagger,
                3 QuadgramTagger, 4 CRF, 5 regexp, 6 dictionary,
                8 affix, 9 TnT, 10 HMM.
            Any other value (including 7) is ignored.
        bambara: corpus object; its ``train_sents`` attribute is used to
            train the taggers.
        option_tones: "tonal" or "nontonal"; selects the regexp tagger
            variant and is forwarded to the dictionary and CRF builders.
        option_tag: "POS" or "Affixes"; selects the regexp tagger variant and
            is forwarded to the CRF builder.
        backoff: innermost tagger of the chain.  It is replaced when an HMM
            or CRF tagger is requested (see below).

    Returns:
        The outermost tagger of the assembled chain.
    """
    step_by_id = {
        0: UnigramTagger,
        1: BigramTagger,
        2: TrigramTagger,
        3: QuadgramTagger,
        4: "crf",
        5: "regexp",
        6: "dic",
        8: "affix",
        9: "tnt",
        10: "hmm",
    }
    taggers = [step_by_id[tagger_id] for tagger_id in num_tagger if tagger_id in step_by_id]

    # CRF and HMM both do not accept backoff and therefore can only be the
    # last tagger in a backoff chain -> DefaultTagger has to be substituted.
    # If both are requested, the CRF tagger (handled second) wins.
    if "hmm" in taggers:
        tag_set = {token[1] for sent in bambara.train_sents for token in sent}
        symbols = {token[0] for sent in bambara.train_sents for token in sent}
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        backoff = trainer.train_supervised(
            bambara.train_sents,
            estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
        )
        # Drop every occurrence: a leftover "hmm" string would otherwise be
        # called like a tagger class in the loop below.
        taggers = [step for step in taggers if step != "hmm"]

    if "crf" in taggers:
        # The original passed the undefined names ``tone`` / ``tag`` here;
        # the function's own options are used instead, same positional order.
        # NOTE(review): confirm the argument order against indivCRF's signature.
        backoff = indivCRF(bambara, option_tones, option_tag)
        model_file = "model.crfbackoff" + option_tag + option_tones + ".tagger"
        backoff.train(bambara.train_sents, model_file)
        backoff.set_model_file(model_file)
        taggers = [step for step in taggers if step != "crf"]

    for step in taggers:
        if step == "dic":
            backoff = dictionary_backoff(option_tones, backoff=backoff)
        elif step == "regexp":
            # An unsupported tone/tag combination leaves the chain unchanged,
            # as in the original implementation.
            if option_tones == "tonal" and option_tag == "Affixes":
                backoff = RegexpTonalSA(backoff=backoff)
            elif option_tones == "tonal" and option_tag == "POS":
                backoff = RegexpTonal(backoff=backoff)
            elif option_tones == "nontonal" and option_tag == "Affixes":
                backoff = RegexpSA(backoff=backoff)
            elif option_tones == "nontonal" and option_tag == "POS":
                backoff = Regexp(backoff=backoff)
        elif step == "affix":
            backoff = AffixTagger(
                bambara.train_sents, min_stem_length=0, affix_length=-4, backoff=backoff
            )
        elif step == "tnt":
            backoff = tnt.TnT(unk=backoff, Trained=True, N=100)
            backoff.train(bambara.train_sents)
        else:
            # N-gram tagger class: train it on top of the chain built so far.
            backoff = step(bambara.train_sents, backoff=backoff)
    return backoff