def train_hmm(train_set, observations, index_features):
    """Train one unsupervised HMM per category."""
    # observations is the list of possible inputs; each input is a vector of
    # features (NLTK requires the symbols to be tuples)
    # the hidden states need no particular meaning, so we simply use 1 and 2
    trainer = HiddenMarkovModelTrainer(states=[1, 2], symbols=observations)
    hmms = {}
    for cat in train_set.keys():
        print("Training HMM of cat:", cat)
        tuple_sentences = []
        for sentence in train_set[cat]:
            # feature subset selection: keep only the features whose indices
            # are listed in index_features
            new_sentence = []
            for word in sentence:
                new_word = [word[feature] for feature in index_features]
                new_sentence.append(new_word)
            # each "word" is a list of features rather than a single token,
            # so turn it into the (symbol, tag) tuple form NLTK expects
            tuple_sentence = [(tuple(word), '') for word in new_sentence]
            tuple_sentences.append(tuple_sentence)
        hmms[cat] = trainer.train_unsupervised(tuple_sentences,
                                               max_iterations=10)
    return hmms
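
A minimal toy invocation of train_hmm, as a sketch only: the data shapes are assumed from the comments above, and all toy_* names are hypothetical.

from nltk.tag.hmm import HiddenMarkovModelTrainer

toy_train_set = {
    "cat_a": [[["w1", "f1"], ["w2", "f2"]], [["w1", "f2"]]],
    "cat_b": [[["w3", "f1"], ["w1", "f1"]]],
}
# symbols must cover every feature tuple produced after feature selection
toy_observations = [("w1", "f1"), ("w1", "f2"), ("w2", "f2"), ("w3", "f1")]
toy_hmms = train_hmm(toy_train_set, toy_observations, index_features=[0, 1])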
Code example #2
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Code example #3
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq, projection=projection)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Code example #6
File: indivTaggers.py (Project: Batene/Bamanankan)
def indivHMM(bambara):
    tag_set = set()
    symbols = set()
    for i in bambara.train_sents:
        for j in i:
            tag_set.add(j[1])
            symbols.add(j[0])
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
    hmm = trainer.train_supervised(bambara.train_sents,
                                   estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    print("HMM accuracy:", hmm.evaluate(bambara.test_sents))
    return hmm
Code example #7
def train(train_set, word_types, tag_set):
    """
    Training.
    Called this way, the HMM knows the whole set of tags and the whole set of
    words, so no word and/or tag is "unknown" during testing.
    """
    # tag_set and word_types are sets, so they need to be turned into lists
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # Simple Good-Turing smoothing
    # see: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    #      http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(
        train_set,
        estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
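
A sketch of how this train function might be driven end to end; the corpus choice (treebank) is illustrative, not from the source.

from nltk.corpus import treebank
from nltk.probability import SimpleGoodTuringProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

tagged = treebank.tagged_sents()
word_types = set(w for sent in tagged for w, t in sent)
tag_set = set(t for sent in tagged for w, t in sent)
hmm = train(tagged, word_types, tag_set)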
Code example #8
    def train(self, labeled_sequence):
        def estimator(fd, bins):
            return LidstoneProbDist(fd, 0.1, bins)

        labeled_sequence = LazyMap(_identity, labeled_sequence)
        symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
        tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
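        # re-wrap the trained model so that its transform is pinned to _identity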
        hmm = HiddenMarkovModelTagger(
            hmm._symbols,
            hmm._states,
            hmm._transitions,
            hmm._outputs,
            hmm._priors,
            transform=_identity,
        )
        self.tagger = hmm
Code example #9
tokenized_docs = []
for file in glob.glob("twitie-tagger/corpora/*agree"):
    print(file)
    with open(file) as f:
        lines = [line.strip().split() for line in f]

    # tokens look like "word_TAG"; keep the last two "_"-separated fields
    tokenized_docs = tokenized_docs + [[word.split("_")[-2:] for word in line if len(word) > 1]
                                       for line in lines]


tokenized_docs_tuples = [[tuple(word) for word in line] for line in tokenized_docs]

# sanity check: every token should now be a (word, tag) pair
for sent in tokenized_docs_tuples:
    for word in sent:
        if len(word) != 2:
            print(word)

words = [word[0] for line in tokenized_docs for word in line]
wordsVocab = list(set(words))
states = [word[1] for line in tokenized_docs for word in line if len(word) > 1]
statesVocab = list(set(states))

#HMMtrainer = HiddenMarkovModelTrainer(states=statesVocab, symbols=wordsVocab)

HMMtrainer = HiddenMarkovModelTrainer()
hmmmodel = HMMtrainer.train(tokenized_docs_tuples)


#print(hmmmodel.tag("wtf did u do ?".split()))
#sentence = "my home is burning".split()
#print(hmmmodel.tag(sentence))
Code example #10
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags, projection=projection)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags, projection=projection)

    code2model = dict()
    fold_models.append(code2model)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        trainer = HiddenMarkovModelTrainer()
        model = trainer.train_supervised(td)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
Code example #11
C1_sequences = []
C2_sequences = []
C3_sequences = []

# pair each ciphertext character with its plaintext character
# (list(...) is needed in Python 3, where zip returns a one-shot iterator)
for i in range(len(C1_train_cipher)):
    C1_sequences.append(list(zip(C1_train_cipher[i], C1_train_plain[i])))

for i in range(len(C2_train_cipher)):
    C2_sequences.append(list(zip(C2_train_cipher[i], C2_train_plain[i])))

for i in range(len(C3_train_cipher)):
    C3_sequences.append(list(zip(C3_train_cipher[i], C3_train_plain[i])))


# note: HiddenMarkovModelTrainer's positional parameters are (states, symbols)
trainer = HiddenMarkovModelTrainer(symbols, states)
print("################## Analysis of Ciphers without improved Plaintext modelling ####################### \n")

if laplace_mode:
    print("################## Laplace ####################### \n")
    C1_tagger = trainer.train_supervised(C1_sequences, estimator=nltk.probability.LaplaceProbDist)
    C2_tagger = trainer.train_supervised(C2_sequences, estimator=nltk.probability.LaplaceProbDist)
    C3_tagger = trainer.train_supervised(C3_sequences, estimator=nltk.probability.LaplaceProbDist)
else:
    C1_tagger = trainer.train_supervised(C1_sequences)
    C2_tagger = trainer.train_supervised(C2_sequences)
    C3_tagger = trainer.train_supervised(C3_sequences)

C1_tester = []
C2_tester = []
C3_tester = []
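
A hedged sketch of decoding held-out ciphertext with one of the trained taggers; C1_test_cipher is an assumed name mirroring the *_train_cipher structure above.

for cipher_sent in C1_test_cipher:
    # tag() pairs each cipher character with its most likely plaintext state
    decoded = [plain for _, plain in C1_tagger.tag(list(cipher_sent))]
    C1_tester.append("".join(decoded))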
Code example #12
File: CrossValidation.py (Project: Batene/Bamanankan)
 def trainALL(self, last):
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         crf = CRFTagger(training_opt={"max_iterations": 100, "max_linesearch": 10, "c1": 0.0001, "c2": 1.0})
         crf_trained = crf.train(
             train_sents,
             "Models/model.crfCrossValidation1" + str(k) + self.option_tone + self.option_tag + ".tagger",
         )
         print(str(k) + " fold: crf")
         tnt_tagger = tnt.TnT(unk=DefaultTagger("n"), Trained=True, N=100)
         tnt_tagger.train(train_sents)
         print(str(k) + " fold: tnt")
         tag_set = set()
         symbols = set()
         for i in train_sents:
             for j in i:
                 tag_set.add(j[1])
                 symbols.add(j[0])
         trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
         hmm = trainer.train_supervised(train_sents, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
         print(str(k) + " fold: hmm")
         if last == "U":
             lasttagger = UnigramTagger(train_sents, backoff=DefaultTagger("n"))
             print(str(k) + " fold: unigram")
         if last == "B":
             if self.option_tone == "tonal" and self.option_tag == "Affixes":
                 regex = RegexpTonalSA(DefaultTagger("n"))
             if self.option_tone == "tonal" and self.option_tag == "POS":
                 regex = RegexpTonal(DefaultTagger("n"))
             if self.option_tone == "nontonal" and self.option_tag == "Affixes":
                 regex = RegexpSA(DefaultTagger("n"))
             if self.option_tone == "nontonal" and self.option_tag == "POS":
                 regex = Regexp(DefaultTagger("n"))
             dic = dictionary_backoff(self.option_tone, regex)
             affix = AffixTagger(train_sents, min_stem_length=0, affix_length=-4, backoff=dic)
             lasttagger = BigramTagger(train_sents, backoff=affix)
             print(str(k) + " fold: bigram")
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.crf_tagged += crf.tag_sents(to_tag)
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.hmm_tagged += hmm.tag_sents(to_tag)
         self.lasttagger_tagged += lasttagger.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     self.crf = crf
     self.tnt = tnt_tagger
     self.hmm = hmm
     self.lasttagger = lasttagger
     org_words = sum(self.org_tagged, [])
     self.crf_avg_acc = accuracy(org_words, sum(self.crf_tagged, []))
     self.tnt_avg_acc = accuracy(org_words, sum(self.tnt_tagged, []))
     self.hmm_avg_acc = accuracy(org_words, sum(self.hmm_tagged, []))
     self.lasttagger_avg_acc = accuracy(org_words, sum(self.lasttagger_tagged, []))
     print("Accuracy of concatenated crf-tagged sentences: ", self.crf_avg_acc)
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated hmm-tagged sentences: ", self.hmm_avg_acc)
     print("Accuracy of concatenated " + last + "-tagged sentences: ", self.lasttagger_avg_acc)
     (self.crf_tagprecision, self.crf_tagrecall) = self.tagprecision_recall(crf, self.crf_tagged, self.org_tagged)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.hmm_tagprecision, self.hmm_tagrecall) = self.tagprecision_recall(hmm, self.hmm_tagged, self.org_tagged)
     (self.lasttagger_tagprecision, self.lasttagger_tagrecall) = self.tagprecision_recall(
         lasttagger, self.lasttagger_tagged, self.org_tagged
     )
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
Code example #13
# coding:utf-8
from nltk.tag.hmm import HiddenMarkovModelTrainer

line_count = 0
pairs = []

with open("../data/may_norm_sentences.txt", "r", encoding="utf-8") as f:
    for line in f:

        if line_count % 1000 == 0:
            print(line_count)

        line_count += 1
        # each line holds two ';'-separated sentences: field 0 supplies the
        # tags, field 1 the observed words
        splitted = line.strip().split(';')

        pairs.append(list(zip(splitted[1].split(" "), splitted[0].split(" "))))
    trainer = HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(pairs)

    print("-".join(tagger.best_path("* я везти сегодня **".split())))
Code example #14
                    sentences.append(sentence)
                sentence = []
            # CoNLL-U columns: [1] = word form, [3] = universal POS tag
            sentence.append((conllu_array[1], conllu_array[3]))

    return sentences


sentences = tokenize_conllu_file('../en-ud-dev.conllu')
cutoff = int(.9 * len(sentences))
training_sentences = sentences[:cutoff]
test_sentences = sentences[cutoff:]

print('Training Sentences : %d ' % (len(training_sentences)))
print('Testing Sentences : %d ' % (len(test_sentences)))


print('Training Start')
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(training_sentences, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print('Training Completed')

print('Testing Start')
tagger.test(test_sentences, verbose=False)
print('Testing Completed')


import dill
with open('my_tagger.dill', 'wb') as f:
    dill.dump(tagger, f)
Code example #15
def train_hmm_model(labeled_names):
    states = ["O", "C"]
    symbols = list(set([ss[0] for sss in labeled_names for ss in sss]))
    hmm_trainer = HiddenMarkovModelTrainer(states=states, symbols=symbols)
    # labeled_names is already a list of tagged sequences, so pass it directly
    hmm = hmm_trainer.train_supervised(labeled_names)
    return hmm
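
A toy call as a sketch only; the data below is hypothetical, with "C"/"O" read as the two states declared above (e.g. capital vs. other characters).

toy_labeled_names = [
    [("J", "C"), ("o", "O"), ("h", "O"), ("n", "O")],
    [("A", "C"), ("n", "O"), ("n", "O")],
]
toy_hmm = train_hmm_model(toy_labeled_names)
print(toy_hmm.tag(["A", "n", "n"]))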
Code example #17
def inference_dset(model: pl.LightningModule, dset: SignSequenceDataset):
    # list of sequences with elements (pred_class, true_class)
    labeled = []
    for i in trange(len(dset)):
        images, targets, _ = dset[i]
        logits = model(images)

        _, pred = logits.topk(1, dim=1)

        # print(pred)

        predicted_signs = pred[:, 0].tolist()

        # print(predicted_signs)

        target_ints = [int(targ) for targ in targets.tolist()]

        labeled.append(list(zip(predicted_signs, target_ints)))

    return labeled


labeled_train = inference_dset(trained, dset_train)
labeled_val = inference_dset(trained, dset_val)

hmm_trainer = HiddenMarkovModelTrainer()

hmm_tagger = hmm_trainer.train(labeled_sequences=labeled_train)

hmm_tagger.test(labeled_val, verbose=False)
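
A hedged sketch of using the resulting tagger to smooth a fresh sequence of per-frame predictions (names assumed from the snippet above):

raw_preds = [pred for pred, _ in labeled_val[0]]  # per-frame argmax classes
smoothed = hmm_tagger.tag(raw_preds)              # [(pred_class, smoothed_class), ...]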
Code example #19
File: backoffCombi.py (Project: Batene/Bamanankan)
def backoff_tagger(num_tagger, bambara, option_tones="tonal", option_tag="POS", backoff=defaultTagger):
    """backoff_tagger of the NLTK cookbook [adapted]"""
    taggers = []
    for i in num_tagger:
        if i == 0:
            taggers = taggers + [UnigramTagger]
        if i == 1:
            taggers = taggers + [BigramTagger]
        if i == 2:
            taggers = taggers + [TrigramTagger]
        if i == 3:
            taggers = taggers + [QuadgramTagger]
        if i == 4:
            taggers += ["crf"]
        if i == 5:
            taggers += ["regexp"]
        if i == 6:
            taggers += ["dic"]
        if i == 8:
            taggers += ["affix"]
        if i == 9:
            taggers += ["tnt"]
        if i == 10:
            taggers += ["hmm"]
    # CRF and HMM do not accept a backoff tagger and can therefore only be the
    # last tagger in a backoff chain -> the DefaultTagger has to be substituted
    if "hmm" in taggers:
        tag_set = set()
        symbols = set()
        for i in bambara.train_sents:
            for j in i:
                tag_set.add(j[1])
                symbols.add(j[0])
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        hmm = trainer.train_supervised(bambara.train_sents,
                                       estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
        backoff = hmm
        taggers.remove("hmm")
    if "crf" in taggers:
        backoff = indivCRF(bambara, tone, tag)
        backoff.train(bambara.train_sents, "model.crfbackoff" + option_tag + option_tones + ".tagger")
        backoff.set_model_file("model.crfbackoff" + option_tag + option_tones + ".tagger")
        taggers.remove("crf")
    for cls in taggers:
        if cls != "tnt" and cls != "affix" and cls != "regexp" and cls != "dic":
            backoff1 = backoff
            backoff = cls(bambara.train_sents, backoff=backoff1)
        else:
            if cls == "dic":
                backoff = dictionary_backoff(option_tones, backoff=backoff)
            if cls == "regexp":
                if option_tones == "tonal" and option_tag == "Affixes":
                    backoff = RegexpTonalSA(backoff=backoff)
                if option_tones == "tonal" and option_tag == "POS":
                    backoff = RegexpTonal(backoff=backoff)
                if option_tones == "nontonal" and option_tag == "Affixes":
                    backoff = RegexpSA(backoff=backoff)
                if option_tones == "nontonal" and option_tag == "POS":
                    backoff = Regexp(backoff=backoff)
            if cls == "affix":
                backoff = AffixTagger(bambara.train_sents, min_stem_length=0, affix_length=-4, backoff=backoff)
            if cls == "tnt":
                backoff = tnt.TnT(unk=backoff, Trained=True, N=100)
                backoff.train(bambara.train_sents)
    return backoff
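
A hedged usage sketch; the bambara corpus object is project-specific, and the indices follow the mapping at the top of the function.

# 0 = UnigramTagger, 1 = BigramTagger, 10 = hmm, so this builds a
# BigramTagger backed off to a UnigramTagger backed off to the HMM
tagger = backoff_tagger([0, 1, 10], bambara)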
Code example #20
test_tagged_corpus = []
for s, st in zip(test_cipher, test_plain):
    sample = list(zip(s, st))
    test_tagged_corpus.append(sample)

if args['laplace']:
    Estimator = LaplaceProbDist
    print_estimator = 'Laplace'  # just for printing
else:
    Estimator = MLEProbDist
    print_estimator = 'MLE'  # just for printing
# /////////////// Train and test with the MLE or Laplace estimator /////////////////

# training
HMM_tagger = HiddenMarkovModelTrainer(states=States, symbols=Symbols)
HMM_tagger = HMM_tagger.train_supervised(train_tagged_corpus,
                                         estimator=Estimator)
print(HMM_tagger)

#/////////////////////// TEXT IMPROVEMENT  /////////////////////////////

if args['lm']:
    # get additional text
    # Text number 2554 English translation of Crime and Punishment
    bigrams = get_bigram(train_plain,
                         url='http://www.gutenberg.org/files/2554/2554-0.txt')
    # conditional freq dist
    cfd = ConditionalFreqDist(bigrams)
    # Conditional probability distribution
    cpd = nltk.ConditionalProbDist(cfd, Estimator)
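
A hedged sketch of querying the resulting conditional distribution (assuming character-level bigrams, which matches the cipher setting; the characters are illustrative):

# probability of seeing 'h' right after 't' under the smoothed bigram model
print(cpd['t'].prob('h'))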
Code example #21
        except:
            print(i, j)
        if HMM._tag_to_index[pred_ys[j]] == HMM._tag_to_index[Ys[i][j]]:
            c += 1
        total += 1

# print(confuse.shape)
# df = pd.DataFrame(confuse)
# # columns=list(self._tag_to_index.keys()),index=list(self._tag_to_index.keys())
# sn.heatmap(df)
# plt.show()
# print(confuse)
A = sum([confuse[i][i] for i in range(len(HMM._index_to_tag))])/total
from nltk.tag.hmm import HiddenMarkovModelTrainer

trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(data)

c = 0
total = 0
# print(len(tagger._symbols))
# print(len(HMM._index_to_word))
# print(len(tagger._states))
# print(len(HMM._index_to_tag))
print([k[1] for k in HMM.predict(Xs[27])])
print([k[1] for k in tagger.tag(Xs[27])])
print(Ys[27])
for i in range(len(Xs)):
    pred_ys = [k[1] for k in tagger.tag(Xs[i])]
    for j in range(len(Xs[i])):
        # confuse[tagger._tag_to_index[pred_ys[j]]][tagger._tag_to_index[Ys[i][j]]]+=1
Code example #22
    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
        if os.path.exists(hmm_fname):
            with open(hmm_fname, "rb") as f:
                base_tagger = dill.load(f)
        else:
            hmm_trainer = HiddenMarkovModelTrainer()
            base_tagger = hmm_trainer.train_supervised(td)
            with open(hmm_fname, "wb") as f:
                dill.dump(base_tagger, f)

        # See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
        # and http://streamhacker.com/2014/12/02/nltk-3/ for changes to the interface

        trainer = BrillTaggerTrainer(base_tagger, templates, deterministic=True)
        model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
Code example #23
            for a4 in range(4):
                # weights could also be drawn at random from possible_weigths;
                # here a fixed assignment is used
                weights = [4, 15, 3, 0.5, 1, 0.1, 3, 2, 5, 0]
                kernel = possible_kernels[a1]
                degree = possible_degree[a2]
                epsilon = possible_epsilon[a3]
                C = possible_C[a4]

                n_components_LDA = 3

                # HMM for feature 9
                trainer = HiddenMarkovModelTrainer()
                st = 3000
                train_data = treebank.tagged_sents()[:st]
                HMM = trainer.train_supervised(train_data)


                # Read training examples and training labels
                N_features = len(weights)
                frases1_train, frases2_train, Y_train = read_training_datasets()
                N_instances_train = len(Y_train)

                # Compute features (X_train)
                X_train = np.zeros((N_instances_train,1))
                X_train = compute_feature1(frases1_train, frases2_train, X_train)
                X_train = compute_feature2(frases1_train, frases2_train, X_train)
                X_train = compute_feature3(frases1_train, frases2_train, X_train)