from nltk.tag.hmm import HiddenMarkovModelTrainer
# to_label_powerset_tagged_sentences, to_sentences, to_flattened_binary_tags_by_code,
# stem and STEM are helpers/constants from the surrounding project.

def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq, projection=projection)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
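# Both fold functions above reduce to the same NLTK pattern: build tagged
# sentences, call train_supervised, then tag_sents on the bare word sequences.
# A minimal self-contained sketch of that round trip on invented toy data
# (the tags and sentences here are not from the project):
from nltk.tag.hmm import HiddenMarkovModelTrainer

td_sents = [
    [("the", "D"), ("cat", "N"), ("sat", "V")],
    [("the", "D"), ("dog", "N"), ("ran", "V")],
]
trainer = HiddenMarkovModelTrainer()
model = trainer.train_supervised(td_sents)
# tag_sents takes bare word sequences, mirroring what to_sentences() produces
print(model.tag_sents([["the", "cat", "ran"]]))
# -> [[('the', 'D'), ('cat', 'N'), ('ran', 'V')]]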
from nltk.tag.hmm import HiddenMarkovModelTrainer

def train_hmm(train_set, observations, index_features):
    """Train one unsupervised HMM per category."""
    # symbols is a vector of inputs; each input is a vector of features
    # (NLTK requires hashable symbols, hence the tuples below).
    # The state labels carry no meaning here, so we arbitrarily use 1 and 2.
    trainer = HiddenMarkovModelTrainer(states=[1, 2], symbols=observations)
    hmms = {}
    for cat in train_set.keys():
        print("Training HMM of cat:", cat)
        tuple_sentences = []
        for sentence in train_set[cat]:
            # feature subset selection: keep only the features in index_features
            new_sentence = []
            for word in sentence:
                new_word = []
                for feature in index_features:
                    new_word.append(word[feature])
                new_sentence.append(new_word)
            tuple_sentence = [(tuple(word), '') for word in new_sentence]
            tuple_sentences.append(tuple_sentence)
        # each sentence is a list of lists, so each element is a feature
        # vector rather than a single word
        hmms[cat] = trainer.train_unsupervised(tuple_sentences, max_iterations=10)
    return hmms
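# A minimal sketch of the train_unsupervised call pattern used above, on toy
# data: observations are hashable feature tuples wrapped as (symbol, '') pairs
# whose tags Baum-Welch ignores, and the state labels [1, 2] are arbitrary.
from nltk.tag.hmm import HiddenMarkovModelTrainer

observations = [('a',), ('b',), ('c',)]  # every feature tuple that can occur
sequences = [
    [(('a',), ''), (('b',), ''), (('b',), '')],
    [(('c',), ''), (('a',), ''), (('b',), '')],
]
trainer = HiddenMarkovModelTrainer(states=[1, 2], symbols=observations)
hmm = trainer.train_unsupervised(sequences, max_iterations=5)
# score a new observation sequence under the learned model
print(hmm.log_probability([(('a',), None), (('b',), None)]))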
from nltk.probability import SimpleGoodTuringProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

def train(train_set, word_types, tag_set):
    """
    Training. Called this way, the HMM knows the whole set of tags and the
    whole set of words (no "unknown" word and/or tag during test).
    """
    # tag_set and word_types are sets; the trainer expects lists
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # Good-Turing smoothing
    # see: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    # http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(
        train_set, estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
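# Hypothetical usage of train() on a slice of the Penn Treebank sample that
# ships with NLTK (may require nltk.download('treebank')). Tagging a training
# sentence keeps the "no unknown words" assumption stated in the docstring.
from nltk.corpus import treebank

tagged_sents = treebank.tagged_sents()[:200]
word_types = {w for sent in tagged_sents for w, t in sent}
tag_set = {t for sent in tagged_sents for w, t in sent}
hmm = train(tagged_sents, word_types, tag_set)
print(hmm.tag(treebank.sents()[0]))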
from nltk.probability import LidstoneProbDist
from nltk.util import LazyMap, unique_list
from nltk.tag.hmm import HiddenMarkovModelTrainer, HiddenMarkovModelTagger
from nltk.tag.hmm import _identity  # private NLTK helper: returns its argument

def train(self, labeled_sequence):
    def estimator(fd, bins):
        return LidstoneProbDist(fd, 0.1, bins)

    labeled_sequence = LazyMap(_identity, labeled_sequence)
    symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
    tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)

    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)

    # rebuild the tagger so the identity transform is also applied at tag time
    hmm = HiddenMarkovModelTagger(
        hmm._symbols,
        hmm._states,
        hmm._transitions,
        hmm._outputs,
        hmm._priors,
        transform=_identity,
    )
    self.tagger = hmm
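# Hypothetical host object for the train() method directly above; the class
# it belongs to is not shown, so a bare namespace stands in for self. Uses a
# small slice of NLTK's treebank sample (may require nltk.download('treebank')).
from types import SimpleNamespace
from nltk.corpus import treebank

holder = SimpleNamespace()
train(holder, treebank.tagged_sents()[:100])   # bind holder as self explicitly
print(holder.tagger.tag(treebank.sents()[0]))  # tag a sentence seen in training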
import pytorch_lightning as pl
from tqdm import trange
from nltk.tag.hmm import HiddenMarkovModelTrainer
# SignSequenceDataset, trained, dset_train and dset_val come from the
# surrounding project.

def inference_dset(model: pl.LightningModule, dset: SignSequenceDataset):
    # returns a list of sequences whose elements are (pred_class, true_class)
    labeled = []
    for i in trange(len(dset)):
        images, targets, _ = dset[i]
        logits = model(images)
        _, pred = logits.topk(1, dim=1)
        predicted_signs = pred[:, 0].tolist()
        target_ints = [int(targ) for targ in targets.tolist()]
        labeled.append(list(zip(predicted_signs, target_ints)))
    return labeled


labeled_train = inference_dset(trained, dset_train)
labeled_val = inference_dset(trained, dset_val)

hmm_trainer = HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train(labeled_sequences=labeled_train)
hmm_tagger.test(labeled_val, verbose=False)
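# Toy sketch of the same train()/test() calls on hand-made
# (pred_class, true_class) sequences, independent of the model and datasets
# above; class labels are plain ints, as in the snippet.
from nltk.tag.hmm import HiddenMarkovModelTrainer

toy_train = [[(0, 0), (0, 0), (1, 1)], [(1, 1), (0, 0)]]
toy_val = [[(0, 0), (1, 1)]]
tagger = HiddenMarkovModelTrainer().train(labeled_sequences=toy_train)
tagger.test(toy_val, verbose=False)  # prints accuracy over all frames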
import nltk
from nltk.probability import ConditionalFreqDist, LaplaceProbDist, MLEProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

test_tagged_corpus = []
for s, st in zip(test_cipher, test_plain):
    sample = list(zip(s, st))
    test_tagged_corpus.append(sample)

if args['laplace']:
    Estimator = LaplaceProbDist
    print_estimator = 'Laplace'  # just for printing
else:
    Estimator = MLEProbDist
    print_estimator = 'MLE'  # just for printing

# /////////////// Train HMM with MLE or Laplace estimator /////////////////
# training
HMM_tagger = HiddenMarkovModelTrainer(states=States, symbols=Symbols)
HMM_tagger = HMM_tagger.train_supervised(train_tagged_corpus, estimator=Estimator)
print(HMM_tagger)

# /////////////////////// TEXT IMPROVEMENT /////////////////////////////
if args['lm']:
    # get additional text: text number 2554, the English translation of
    # Crime and Punishment
    bigrams = get_bigram(train_plain, url='http://www.gutenberg.org/files/2554/2554-0.txt')
    # conditional frequency distribution
    cfd = ConditionalFreqDist(bigrams)
    # conditional probability distribution
    cpd = nltk.ConditionalProbDist(cfd, Estimator)
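# Minimal sketch of the ConditionalFreqDist -> ConditionalProbDist step the
# language-model branch performs, on toy character bigrams instead of the
# Gutenberg text fetched by get_bigram (a project helper).
import nltk
from nltk.probability import ConditionalFreqDist, LaplaceProbDist

toy_bigrams = list(nltk.bigrams("hello world"))
cfd = ConditionalFreqDist(toy_bigrams)
cpd = nltk.ConditionalProbDist(cfd, LaplaceProbDist)
print(cpd['l'].prob('o'))  # P(next char = 'o' | current char = 'l')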
import nltk
from nltk.tag.hmm import HiddenMarkovModelTrainer

C1_sequences = []
C2_sequences = []
C3_sequences = []

# pair each cipher character with its plaintext character
# (list(...) so the pairs survive iteration in Python 3)
for i in range(len(C1_train_cipher)):
    C1_sequences.append(list(zip(C1_train_cipher[i], C1_train_plain[i])))
for i in range(len(C2_train_cipher)):
    C2_sequences.append(list(zip(C2_train_cipher[i], C2_train_plain[i])))
for i in range(len(C3_train_cipher)):
    C3_sequences.append(list(zip(C3_train_cipher[i], C3_train_plain[i])))

# NB: the constructor signature is HiddenMarkovModelTrainer(states, symbols);
# this positional order is only harmless if both alphabets are the same set
trainer = HiddenMarkovModelTrainer(symbols, states)

print("################## Analysis of Ciphers without improved Plaintext modelling ####################### \n")

if laplace_mode:
    print("################## Laplace ####################### \n")
    C1_tagger = trainer.train_supervised(C1_sequences, estimator=nltk.probability.LaplaceProbDist)
    C2_tagger = trainer.train_supervised(C2_sequences, estimator=nltk.probability.LaplaceProbDist)
    C3_tagger = trainer.train_supervised(C3_sequences, estimator=nltk.probability.LaplaceProbDist)
else:
    C1_tagger = trainer.train_supervised(C1_sequences)
    C2_tagger = trainer.train_supervised(C2_sequences)
    C3_tagger = trainer.train_supervised(C3_sequences)

C1_tester = []
C2_tester = []
C3_tester = []
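# Sketch of the decoding step that presumably follows: tag() maps each cipher
# character to its most likely plaintext character via Viterbi.
# C1_test_cipher is an assumed test split mirroring the training variables.
decoded = C1_tagger.tag(list(C1_test_cipher[0]))
plaintext = ''.join(plain for _, plain in decoded)
print(plaintext)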