Code Example #1
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags, projection=projection)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
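
For context, here is a minimal sketch of how a fold function like this might be driven from a cross-validation loop. The cross_validation helper, essays, and the fold count are assumptions for illustration; only train_classifer_on_fold and regular_tags come from the example above.

# Hypothetical driver - cross_validation() and essays are assumed to exist.
for fold, (essays_TD, essays_VD) in enumerate(cross_validation(essays, folds=5)):
    ys_td, ys_vd, preds_td, preds_vd = train_classifer_on_fold(
        essays_TD, essays_VD, regular_tags, fold)
    # Each return value maps a tag code to flattened binary labels,
    # ready for per-code precision/recall computation.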
Code Example #2
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):

    projection = lambda x: x
    if STEM:
        projection = stem

    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequency from training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq, projection=projection)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq, projection=projection)

    trainer = HiddenMarkovModelTrainer()
    model = trainer.train_supervised(td_sents)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Code Example #3
def train_hmm(train_set, observations, index_features):
    '''Train one unsupervised HMM per category.'''
    # symbols is a vector of inputs; each input is a vector of features
    # (NLTK requires hashable symbols, hence the tuples built below).
    # We don't need meaningful state names, so we simply use 1 and 2.
    trainer = HiddenMarkovModelTrainer(states=[1, 2], symbols=observations)
    hmms = {}
    for cat in train_set.keys():
        print("Training HMM of cat:", cat)
        tuple_sentences = []
        for sentence in train_set[cat]:
            # Feature subset selection: keep only the features listed
            # in index_features for each word.
            new_sentence = []
            for word in sentence:
                new_word = [word[feature] for feature in index_features]
                new_sentence.append(new_word)
            # Each "word" is a list of features, not a single token, so
            # convert it to a tuple; the empty string is a dummy tag.
            tuple_sentence = [(tuple(word), '') for word in new_sentence]
            tuple_sentences.append(tuple_sentence)
        hmms[cat] = trainer.train_unsupervised(tuple_sentences,
                                               max_iterations=10)
    return hmms
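
Once trained, one plausible way to use these per-category HMMs is as a classifier: score an unlabeled sentence under each model and pick the category with the highest likelihood. This sketch assumes the hmms dict and index_features from the function above; pairing each symbol with a None tag makes log_probability marginalize over states via the forward algorithm.

def classify(hmms, sentence, index_features):
    # Project onto the selected features, exactly as during training.
    seq = [(tuple(word[f] for f in index_features), None) for word in sentence]
    # Highest forward log-probability wins.
    return max(hmms, key=lambda cat: hmms[cat].log_probability(seq))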
Code Example #4
def train(train_set, word_types, tag_set):
    """
    Training.
    Called this way, the HMM knows the whole set of tags and the whole set
    of words, so there are no "unknown" words or tags at test time.
    """
    # tag_set and word_types are sets, so convert them to lists.
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # Simple Good-Turing smoothing; see:
    #   https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    #   http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(
        train_set,
        estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
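
A quick sanity check for a tagger trained this way is NLTK's built-in test helper, which strips the tags from a labelled test set, re-tags the words, and prints accuracy. test_set is an assumed held-out list of tagged sentences in the same format as train_set.

# test_set is assumed: held-out sentences of (word, tag) pairs.
hmm = train(train_set, word_types, tag_set)
hmm.test(test_set, verbose=False)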
Code Example #5
    def train(self, labeled_sequence):
        def estimator(fd, bins):
            return LidstoneProbDist(fd, 0.1, bins)

        labeled_sequence = LazyMap(_identity, labeled_sequence)
        symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
        tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
        hmm = HiddenMarkovModelTagger(
            hmm._symbols,
            hmm._states,
            hmm._transitions,
            hmm._outputs,
            hmm._priors,
            transform=_identity,
        )
        self.tagger = hmm
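
The model is rebuilt with an identity transform so the stored tagger can be applied to plain token lists. A minimal usage sketch, assuming wrapper is an instance of the surrounding class:

# wrapper is assumed to be an instance of the class defining train().
wrapper.train(labeled_sentences)  # one [(word, tag), ...] list per sentence
print(wrapper.tagger.tag(["time", "flies", "like", "an", "arrow"]))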
Code Example #6
def inference_dset(model: pl.LightningModule, dset: SignSequenceDataset):
    # list of sequences with elements (pred_class, true_class)
    labeled = []
    for i in trange(len(dset)):
        images, targets, _ = dset[i]
        logits = model(images)

        _, pred = logits.topk(1, dim=1)
        predicted_signs = pred[:, 0].tolist()

        target_ints = [int(targ) for targ in targets.tolist()]

        labeled.append(list(zip(predicted_signs, target_ints)))

    return labeled


labeled_train = inference_dset(trained, dset_train)
labeled_val = inference_dset(trained, dset_val)

hmm_trainer = HiddenMarkovModelTrainer()

hmm_tagger = hmm_trainer.train(labeled_sequences=labeled_train)

hmm_tagger.test(labeled_val, verbose=False)
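
Beyond test, the trained tagger can smooth a fresh sequence of per-frame CNN predictions directly: tag runs Viterbi decoding, mapping the noisy predicted-class sequence (the HMM's symbols) to the most likely true-class sequence (its states). A sketch, assuming a dset_test shaped like the datasets above:

# dset_test is assumed to yield (images, targets, _) like dset_train.
images, targets, _ = dset_test[0]
_, pred = trained(images).topk(1, dim=1)
noisy_preds = pred[:, 0].tolist()

# Viterbi decoding: smooth the frame-level predictions.
smoothed = [state for _, state in hmm_tagger.tag(noisy_preds)]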
Code Example #7
test_tagged_corpus = []
for s, st in zip(test_cipher, test_plain):
    sample = list(zip(s, st))
    test_tagged_corpus.append(sample)

if args['laplace']:
    Estimator = LaplaceProbDist
    print_estimator = 'Laplace'  # just for printing
else:
    Estimator = MLEProbDist
    print_estimator = 'MLE'  # just for printing
#/////////////// Train/test with the MLE or Laplace estimator /////////////////

# training
HMM_tagger = HiddenMarkovModelTrainer(states=States, symbols=Symbols)
HMM_tagger = HMM_tagger.train_supervised(train_tagged_corpus,
                                         estimator=Estimator)
print(HMM_tagger)

#/////////////////////// TEXT IMPROVEMENT  /////////////////////////////

if args['lm']:
    # get additional text
    # Text number 2554 English translation of Crime and Punishment
    bigrams = get_bigram(train_plain,
                         url='http://www.gutenberg.org/files/2554/2554-0.txt')
    # conditional freq dist
    cfd = ConditionalFreqDist(bigrams)
    # Conditional probability distribution
    cpd = nltk.ConditionalProbDist(cfd, Estimator)
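
With the corpus built at the top of this example, evaluating the trained decipherer is one call: test strips the plaintext tags, decodes the cipher characters, and reports per-character accuracy.

# Score the supervised decipherer on the held-out cipher/plain pairs.
HMM_tagger.test(test_tagged_corpus, verbose=False)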
Code Example #8
C1_sequences = []
C2_sequences = []
C3_sequences = []

# Materialize the (cipher, plain) pairs as lists: a bare zip() object
# would be exhausted after a single training pass.
for i in range(len(C1_train_cipher)):
    C1_sequences.append(list(zip(C1_train_cipher[i], C1_train_plain[i])))

for i in range(len(C2_train_cipher)):
    C2_sequences.append(list(zip(C2_train_cipher[i], C2_train_plain[i])))

for i in range(len(C3_train_cipher)):
    C3_sequences.append(list(zip(C3_train_cipher[i], C3_train_plain[i])))

# HiddenMarkovModelTrainer takes (states, symbols); keyword arguments
# avoid silently swapping the two.
trainer = HiddenMarkovModelTrainer(states=states, symbols=symbols)
print("################## Analysis of Ciphers without improved Plaintext modelling ####################### \n")

if laplace_mode:
    print("################## Laplace ####################### \n")
    C1_tagger = trainer.train_supervised(C1_sequences, estimator=nltk.probability.LaplaceProbDist)
    C2_tagger = trainer.train_supervised(C2_sequences, estimator=nltk.probability.LaplaceProbDist)
    C3_tagger = trainer.train_supervised(C3_sequences, estimator=nltk.probability.LaplaceProbDist)
else:
    C1_tagger = trainer.train_supervised(C1_sequences)
    C2_tagger = trainer.train_supervised(C2_sequences)
    C3_tagger = trainer.train_supervised(C3_sequences)

C1_tester = []
C2_tester = []
C3_tester = []
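
The tester lists are presumably filled the same way as the training sequences; a plausible continuation, assuming C1_test_cipher/C1_test_plain (and the C2/C3 analogues) mirror the training variables:

# Assumed: C*_test_cipher / C*_test_plain mirror the training data.
for i in range(len(C1_test_cipher)):
    C1_tester.append(list(zip(C1_test_cipher[i], C1_test_plain[i])))

C1_tagger.test(C1_tester, verbose=False)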