Example #1
def train(cipher_train_data, plain_train_data,
          cipher_test_data, plain_test_data, laplace_smoothing=False):

    train_data = prepare_data(cipher_train_data, plain_train_data)

    trainer = hmm.HiddenMarkovModelTrainer(states=_ALPHABET, symbols=_ALPHABET)
    # Trainer uses MLE by default
    if laplace_smoothing:
        tagger = trainer.train_supervised(train_data, estimator=LaplaceProbDist)
    else:
        tagger = trainer.train_supervised(train_data)

    correct = 0
    total = 0
    # Test model
    for s_cipher, s_plain in zip(cipher_test_data, plain_test_data):
        cipher, decoded = list(zip(*tagger.tag([c for c in s_cipher])))
        decoded = "".join(decoded)
        cipher = "".join(cipher)
        print("\n")
        print(f"Cipher - {cipher}")
        print(f"Plain - {s_plain}")
        print(f"Prediction - {decoded}")

        for c_decoded, c_plain in zip(decoded, s_plain):
            if c_decoded == c_plain:
                correct += 1
        total += len(decoded)

    print(f"\n>>> Accuracy {correct/total}")
Example #2
def HMM(data, symbols, tag_set, verbose=True):
    '''
    HMM(data, symbols, tag_set, verbose) -> model, prediction, report (dict).
    Keyword arguments:
        data: training/test data; see preprocessing.py for the structure
        symbols: list of the input class labels
        tag_set: list of the output class labels
    '''
    trainer = hmm.HiddenMarkovModelTrainer(tag_set, symbols)
    tagger = trainer.train_supervised(
        data.y_train,
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
    )

    y_pred = []
    for sentence in data.x_test:
        y_pred.append(tagger.tag(sentence))

    # unlike the test or evaluate functions from the same suite, this requires
    # a flat list of predicted tags, not (symbol, tag) tuples
    y_pred = [[tup[1] for tup in sentence] for sentence in y_pred]

    print('HMM Results:')
    print(gen_rep_flat(data, y_pred, False))
    return tagger, y_pred, gen_rep_flat(data, y_pred, True)
Example #3
def ie_preprocess(document):
    print(document)
    sentences = nltk.sent_tokenize(document)
    # print(sentences)
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    print("\nDefault tagger")
    x = [t0.tag(sent) for sent in sentences]
    print(x)
    print("\nUnigram tagger")
    x = [t1.tag(sent) for sent in sentences]
    print(x)
    print("\nBigram tagger")
    x = [t2.tag(sent) for sent in sentences]
    print(x)
    print("\nTrigram tagger")
    x = [t3.tag(sent) for sent in sentences]
    print(x)
    print("\n")
    # sentences = [nltk.pos_tag(sent) for sent in sentences]
    trainer = hmm.HiddenMarkovModelTrainer()
    train_data = treebank.tagged_sents()[:3000]
    tagger = trainer.train_supervised(train_data)
    print(tagger)
    print("\nHMM tagger")
    x = [tagger.tag(sent) for sent in sentences]
    print(x)
    print("\nPOS Tag")
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print(sentences)
    return sentences
Example #4
 def __init__(self):
     """
     Constructor.
     """
     super().__init__()
     self.trainer = hmm.HiddenMarkovModelTrainer()
     self.tagger = None
Example #5
def main():
    # load corpus
    tokens, labels, sentences = read_corpus()
    trainer = h.HiddenMarkovModelTrainer(labels, tokens)

    # load model
    hmm = None
    try:
        with open("hmm_pretrain.pkl", 'rb') as model:
            hmm = pickle.load(model)
    except (OSError, pickle.UnpicklingError):
        # no usable pre-trained model yet; train from scratch below
        pass

    # training model
    hmm = trainer.train_unsupervised(list(sentences),
                                     max_iterations=2,
                                     model=hmm)

    # save model
    model = open("hmm_pretrain.pkl", 'wb')
    pickle.dump(hmm, model)
    model.close()

    # test the trained model
    hmm.test(list(sentences[:10]), verbose=True)
Example #6
 def __init__(self, train_sents, sent_dict):
     '''train_sents entries are in form [((w, pos_tag), iob_tag),...]
     '''
     train_set = []
     tag_set = []
     symbols = []
     self.stemmer = LancasterStemmer()
     self.just_pos = False
     self.use_pos = False
     for tagged_sent in train_sents:
         example = []
         for i, (wd_pos, tag) in enumerate(tagged_sent):
             tag_set.append(tag)
             if self.just_pos:
                 symb = wd_pos[1]
             elif self.use_pos:
                 #symb = wd_pos[0]+wd_pos[1]
                 symb = self.stemmer.stem(wd_pos[0]) + wd_pos[1]
             else:
                 symb = self.stemmer.stem(wd_pos[0])
             symbols.append(symb)
             example.append((symb, tag))
         train_set.append(example)
     trainer = hmm.HiddenMarkovModelTrainer(list(set(tag_set)),
                                            list(set(symbols)))
     self.hmm = trainer.train_supervised(train_set)
Example #7
def trainModel(train_sent, laplace, symbols):
    if laplace:
        estimator = LaplaceProbDist
    else:
        estimator = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer(symbols=symbols)
    model = trainer.train(labeled_sequences=train_sent, estimator=estimator)
    return model
Example #8
def hmm_base(path):
    train_corpus = load_files(p=path)
    test_corpus = load_files(p=path, mode="test")
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_corpus)
    res = tagger.evaluate(test_corpus)

    # accuracy
    print("test accuracy {}".format(res))
Example #9
def hmm_tagging(train, test):
    """
    Train a HMM for prediction.
    Did not work so well.
    Especially when not using the eplacement method for really rare words. (Freq <=2)
    :param train: Annotated training data
    :param test: Annotated test data for eval
    :return:
    """
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train)
    print(tagger.evaluate(test))
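
The docstring above mentions a replacement method for really rare words; a minimal sketch of that preprocessing step, assuming tagged sentences of (word, tag) tuples and an illustrative "<UNK>" placeholder and frequency threshold (neither is part of the original code), could look like this:

from collections import Counter

def replace_rare_words(tagged_sents, min_freq=3, unk="<UNK>"):
    # count how often each word occurs in the tagged training sentences
    counts = Counter(word for sent in tagged_sents for word, _ in sent)
    # replace every word seen fewer than min_freq times with the placeholder
    return [[(word if counts[word] >= min_freq else unk, tag) for word, tag in sent]
            for sent in tagged_sents]

# usage sketch: preprocess the training data before calling hmm_tagging(train, test)
# train = replace_rare_words(train)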
Example #10
def validacion_cruzada(classificador, train_data, k):
    kf = KFold(k, shuffle=True)
    scores = []

    for train_idx, test_idx in kf.split(train_data):
        # KFold yields index arrays (non-contiguous when shuffled),
        # so select each fold by index instead of slicing
        train_split = [train_data[i] for i in train_idx]
        test_split = [train_data[i] for i in test_idx]
        if isinstance(classificador, hmm.HiddenMarkovModelTrainer):
            model = classificador.train_supervised(train_split)
            scores.append(model.evaluate(test_split))
        else:
            classificador.train(train_split)
            scores.append(classificador.evaluate(test_split))

    return scores
Example #11
def hmm_laplace(path):
    train_corpus = load_files(p=path)
    test_corpus = load_files(p=path, mode="test")

    def est(fd, bins):
        return LidstoneProbDist(fd, 1, bins)

    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_corpus, estimator=est)
    # print(test_corpus[0])
    res = tagger.evaluate(test_corpus)

    # accuracy
    print("test accuracy {}".format(res))
Example #12
    def train(self, file):
        """
        Trains the Diacritic Restorer on the training set from the given file using the HMM of n-grams.

        :param file: path to file with training set (sentences with diacritics, ideally detokenized)
        :type file: str
        :return: self for further use
        :rtype: BaseDiacriticsRestorer
        """
        buffer = CorpusNgramBuffer(file, self.n)
        self.tagger = hmm.HiddenMarkovModelTrainer().train_supervised(buffer)
        buffer.close()
        milestone("training done: ")
        return self
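
As a rough, self-contained illustration of the idea described in the docstring (this is not the project's CorpusNgramBuffer, and the character-level pairs below are a simplifying assumption): the hidden states are the original diacritized characters and the observations are their stripped forms, so train_supervised() learns to map plain text back to accented text.

import unicodedata
from nltk.tag import hmm

def strip_diacritics(ch):
    # decompose the character and drop combining marks, e.g. 'á' -> 'a'
    return ''.join(c for c in unicodedata.normalize('NFD', ch)
                   if not unicodedata.combining(c))

sentences = ["příliš žluťoučký kůň"]  # toy training data
train_seqs = [[(strip_diacritics(ch), ch) for ch in s] for s in sentences]
toy_restorer = hmm.HiddenMarkovModelTrainer().train_supervised(train_seqs)
print(''.join(tag for _, tag in toy_restorer.tag(list("zlutoucky kun"))))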
Example #13
def test():
    files = loadFiles(sys.argv[1])
    labeled_data = label(files["train_data"].read(), files["train_txt"].read())
    trainer = hmm.HiddenMarkovModelTrainer()
    if len(sys.argv) > 2 and sys.argv[2] == "-laplace":
        tagger = trainer.train_supervised(labeled_data, estimator=LaplaceProbDist)
    else:
        tagger = trainer.train_supervised(labeled_data)
    test_data = testPrep(files["test_data"].read())
    comparison = testPrep(files["test_txt"].read())
    results = 0
    for element in test_data:
        output = tagger.tag(element)
        results += accuracy(output, comparison[0])
        comparison.pop(0)
    return results / len(test_data)
Example #14
    def fit(self, data):
        """
        Fits a tagging model to object's data based on object's tagger name
        :return: a tagger object
        """
        tagger = None
        self.X = data

        if self.tagger_type == 'hmm':
            # Setup a trainer with default(None) values
            # And train with the data
            trainer = hmm.HiddenMarkovModelTrainer()
            tagger = trainer.train_supervised(data)

        elif self.tagger_type == 'crf':
            trainer = CRFTagger()
            trainer.train(self.train_data, 'model.crf.tagger')
            tagger = trainer

        self.tagger = tagger

        return tagger
Example #15
def ie_preprocess(data):
    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_data)
    print(tagger)
    return tagger.tag(data.split())
Example #16
train_data = treebank.tagged_sents()[:3900]

# # Loading the previously trained HMM model
#
# Structure of the training data. Keep in mind that the tag convention differs from UPOS, since the dataset is old and therefore follows a different convention. The algorithm works with any convention.

# In[ ]:

train_data

# Pre-built HMM in NLTK

# In[ ]:

from nltk.tag import hmm
tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_data)
tagger

# In[ ]:

tagger.tag("Pierre Vinken will get old".split())

# Training accuracy

# In[ ]:

tagger.evaluate(treebank.tagged_sents()[:3900])

# ## Practice exercise
#
# **Objective:** Train an HMM using the `hmm.HiddenMarkovModelTrainer()` class on the `UD_Spanish_AnCora` dataset.
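#
# A minimal sketch for this exercise, assuming the AnCora treebank has already been downloaded and that `ancora_path` points to one of its CoNLL-U files (the path and the 90/10 split below are illustrative assumptions, not part of the course material):

# In[ ]:

def read_conllu_tagged_sents(path):
    """Parse a CoNLL-U file into sentences of (word, UPOS) tuples."""
    sents, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current:
                    sents.append(current)
                    current = []
            elif not line.startswith("#"):
                cols = line.split("\t")
                if cols[0].isdigit():  # skip multiword-token ranges and empty nodes
                    current.append((cols[1], cols[3]))
    if current:
        sents.append(current)
    return sents

ancora_sents = read_conllu_tagged_sents(ancora_path)
split = int(len(ancora_sents) * 0.9)
ancora_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(ancora_sents[:split])
ancora_tagger.evaluate(ancora_sents[split:])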
Example #17
                    tag_prevtag = key + '|'+ key2
                    if tag_prevtag in transitionProbdict.keys():
                        if viterbiProb[tag_row2, col-1]>0:
                            possible_probs.append(viterbiProb[tag_row2, col-1]*transitionProbdict[tag_prevtag]*emissionProbdict[word_tag])
                ## Now pick the maximum of the candidate probabilities in this column
                viterbiProb[tag_row, col] = max(possible_probs)
    
    ## Build the output sequence of (word, tag) pairs
    res = []
    for i, p in enumerate(seq):
        for tag in tagStateDict.keys():
            if tagStateDict[tag] == np.argmax(viterbiProb[:, i]):
                res.append((p, tag))
    return res

vector = ViterbiTags('el mundo es pequeño')
#print(vector)


## Now the training can be done directly with NLTK

train_data = treebank.tagged_sents()[:3900]
#print(train_data)

tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_data)

test = tagger.tag('Pierre Vinken will get old'.split())
print(test)

check = tagger.evaluate(train_data)
print(check)
Example #18
for i in range(0, len(train_x_characters)):
    x = train_x_characters[i]
    y = train_y_characters[i]
    train_data.append((x, y))

train_data = [train_data]

for i in range(0, len(test_x_characters)):
    x = test_x_characters[i]
    y = test_y_characters[i]
    test_data.append((x, y))

test_data = [test_data]
## without segmentation

## the processed 3rd cipher cannot use this line
states = symbols = x_list
hmm_trainer = hmm.HiddenMarkovModelTrainer(states, symbols)
##

##Laplace Smoothing
if laplace_improved:
    hmm_model = hmm_trainer.train_supervised(train_data,
                                             estimator=pb.LaplaceProbDist)
else:
    hmm_model = hmm_trainer.train_supervised(train_data)
##

##try different functions
result = hmm_model.best_path(test_x_characters)
result_acc = hmm_model.evaluate(test_data)
##
results = ('').join(result)
Example #19
 def train_tagger(self, train_data):
     """ Trains an hmm pos tagger"""
     self.trainer = hmm.HiddenMarkovModelTrainer()
     self.tagger = self.trainer.train_supervised(train_data)
Example #20
 def __init__(self, labeled_sequences, substitute):
     self.substitute = substitute
     trainer = hmm.HiddenMarkovModelTrainer()
     self.my_tagger = trainer.train_supervised(self.process_lines(labeled_sequences))
Example #21
def main():

    file_path = r'fi-ud-train.pos-tagged.txt'

    tagged_sents = read_tagged_sents(file_path)
    random.shuffle(tagged_sents)

    # Copy the 5 first sentences so that the words that will be replaced with '<UNK>' can
    # be used when printing
    ref_sents = copy.deepcopy(tagged_sents[:5])

    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]

    # Make 2 list variables that consist only of (w,t) tuples so the word amount is easier to count
    train_set_words = list(itertools.chain.from_iterable(train_set))
    test_set_words = list(itertools.chain.from_iterable(test_set))

    # Frequencies of words in the train_set
    train_set_wt_freqs = Counter(train_set_words)

    # Go through train_set and change words with frequencies below 3 to '<UNK>'
    for i, sent in enumerate(train_set):
        for j, (word, tag) in enumerate(sent):
            if train_set_wt_freqs[(word, tag)] < 3:
                sent[j] = ('<UNK>', tag)

        if i > 500:  # For the 500 first sentences
            break

    unk_words = []

    # Go through test_set and change words that don't appear in the train_set into '<UNK>'
    for sent in test_set:
        for i, (word, tag) in enumerate(sent):
            if (word, tag) not in train_set_words:
                unk_words.append((word, tag))
                sent[i] = ('<UNK>', tag)

    UNK_rel_freq = len(unk_words) / len(test_set_words)
    print("Relative frequency of unknown words in the test set: {}\n".format(
        UNK_rel_freq))

    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(train_set)

    print("HMM based POS tagger's accuracy: {}".format(
        tagger.evaluate(test_set)))

    # List of the 5 first sentences in the test_set
    print_sents = [[word for word, tag in tagged_sent]
                   for tagged_sent in tagged_sents[:5]]

    print('\n\n5 sentences tagged by the HMM tagger:\n')
    for i, sent in enumerate(print_sents):
        print("Sentence {}:".format(i + 1))
        tagged_sent = tagger.tag(sent)

        # Add the actual word in front of the possible '<UNK>'
        for j, (word, tag) in enumerate(tagged_sent):
            ref_word = ref_sents[i][j][0]
            tagged_sent[j] = (re.sub(r'(<UNK>)', ref_word + r'\1', word), tag)

        print(tagged_sent, "\n")
Example #22
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace',
                        '-laplace',
                        action='store_true',
                        default=False,
                        help='Laplace Smoothing')
    parser.add_argument('--langmod',
                        '-lm',
                        action='store_true',
                        default=False,
                        help='Improved decoder')

    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod
    number_of_supp_lines = 100  #the more lines the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:  #this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)

    #decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    #because there's a bug in train_unsupervised (although I found out how to fix it!), I will have to do this manually....
    #code copied from the nltk train_supervised method
    #here, we are updating the transition data to include our supplemental data
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()  # declaring
        transitions = ConditionalFreqDist()  # declaring; why we needed all the transitions in the same place
        for sequence in supp_data:
            lasts = None
            for state in sequence:
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                lasts = state

        if laplace:
            estimator = LaplaceProbDist
        else:
            estimator = lambda fdist, bins: MLEProbDist(fdist)  # getting this straight from the source code

        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        #conditionalPD is actually already defined by our previously trained model as outputs.
        #we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print "".join([y[1] for y in decoder.tag([x[0] for x in sent])])
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-laplace',
                        help="adds laplace smoothing",
                        action="store_true")
    parser.add_argument(
        '-lm',
        help="informs the character transitions in english using extra-text",
        action='store_true')
    parser.add_argument('cipher',
                        help='select the cipher that you want to decode',
                        type=str)
    args = parser.parse_args()

    path_to_directory = os.path.abspath(os.path.curdir)
    path_to_cipher = os.path.join(path_to_directory, args.cipher)

    if args.laplace:
        estimation_method = LaplaceProbDist
    else:
        estimation_method = MLEProbDist

    cipher_test, cipher_train, plaintext_test, plaintext_train = read_in_ciphers(
        path_to_cipher)

    training_set = []
    for i in range(len(cipher_train)):
        training_units = turn_training_observation_into_nltk_format(
            cipher_train[i], plaintext_train[i])
        training_set.append(training_units)

    hidden_markov_trainer = hmm.HiddenMarkovModelTrainer()
    tagger = hidden_markov_trainer.train_supervised(
        training_set, estimator=estimation_method)

    if args.lm:
        with open('frankenstein_ulysses_hrtofdarkness.txt', 'rb') as f:
            extra_text = f.read()
            extra_text = str(extra_text)

        extra_text = extra_text.replace(r'\r', '')
        extra_text = extra_text.replace(r'\n', '')
        extra_text = extra_text.replace(r'\x', '')

        extra_text = clean_additional_text(extra_text)

        additional_text_transitions = find_transition_frequency(extra_text)
        original_text_transitions = find_transition_frequency(
            ''.join(plaintext_train))

        combined_transition_frequency = additional_text_transitions.__add__(
            original_text_transitions)

        tagger._transitions = ConditionalProbDist(
            combined_transition_frequency, estimation_method,
            len(combined_transition_frequency.keys()))

    test_set = turn_test_cipher_into_nltk_format(cipher_test)

    predictions = [tagger.tag(test_sentence) for test_sentence in test_set]
    predicted_sequence = extract_predicted_sequence(predictions)
    recomposed_sentences = [
        ''.join(sentence) for sentence in predicted_sequence
    ]

    print('\n')
    print('These sentences were decoded using the hidden markov model: \n')

    for sentence in recomposed_sentences:
        print(sentence)

    print('\n')

    whole_text_accuracy = find_total_accuracy(recomposed_sentences,
                                              plaintext_test)
    print('The accuracy for the whole text was %s' % whole_text_accuracy)
Example #24
    plain_path = cipher_folder + '/train_plain.txt'
    cipher_path = cipher_folder + '/train_cipher.txt'
    cipher_train = get_text(cipher_path)
    plain_train = get_text(plain_path)
    # format the training data
    train_data = format_data(cipher_train, plain_train)

    # test data
    testc_path = cipher_folder + '/test_cipher.txt'
    testp_path = cipher_folder + '/test_plain.txt'
    testc = get_text(testc_path)
    testp = get_text(testp_path)
    # format the test data
    test_data = format_data(testc, testp)

    trainer = hmm.HiddenMarkovModelTrainer()

    #laplace estimator
    my_estimator = lambda fdist, bins: LaplaceProbDist(fdist, bins)

    if args.laplace_smoothing:
        if args.supplement:
            tagger = train_supervised2(trainer,
                                       train_data,
                                       extra_text(),
                                       estimator=my_estimator)
        else:
            tagger = trainer.train_supervised(train_data,
                                              estimator=my_estimator)
    else:
        if args.supplement:
Example #25
    current_sentence = []
    test = []
    for i in range(0, len(test_file) - 1):
        current_line = test_file[i]
        word_tag = current_line.split('\t\t')
        words.add(word_tag[0])
        tags.add(word_tag[1])
        current_sentence.append((word_tag[0], word_tag[1]))
        if word_tag[0] == '.':
            test.append(current_sentence)
            current_sentence = []

    tags = list(tags)
    words = list(words)
    trainer = hmm.HiddenMarkovModelTrainer(tags, words)
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: MLEProbDist(fd))
    tagger = trainer.train_supervised(
        train, estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: WittenBellProbDist(fd, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: KneserNeyProbDist(fd, bins))

    print("here")
    predicted = []
    real = []
    for i in range(0, len(test) - 1):
        current = list(zip(*test[i]))
        tagged = tagger.tag(list(current[0]))
        current_tags = list(list(zip(*tagged))[1])
        predicted += current_tags