Code Example #1
from nltk.tag import HiddenMarkovModelTagger


def main(train_data, test_data):
    print("Training")

    m = HiddenMarkovModelTagger.train(train_data)

    print("Predicting")
    predicted_labels = []

    for i, sent in enumerate(test_data):
        if i % 500 == 0:
            print("%d / %d" % (i, len(test_data)))
        # tag the bare words and keep only the predicted tags
        predicted_labels += [tag
                             for _, tag in m.tag(
                                     [word for word, _ in sent]
                             )]

    # gold tags, flattened in the same order as the predictions
    correct_labels = [tag
                      for sent in test_data
                      for _, tag in sent]

    # print(predicted_labels)
    # print(correct_labels)

    from sklearn.metrics import classification_report

    print(classification_report(correct_labels, predicted_labels))

    correct_n = len([1
                     for p, c in zip(predicted_labels, correct_labels)
                     if p == c])

    print("Item accuracy:", float(correct_n) / len(correct_labels))
Code Example #2
def compare_taggers(train_data_Brown, train_data_Universal,
                    test_data_Brown, test_data_Universal):

  tagger_Brown = HiddenMarkovModelTagger.train(train_data_Brown)
  tagger_Universal = HiddenMarkovModelTagger.train(train_data_Universal)
  

  eval_Brown = tagger_Brown.evaluate(test_data_Brown)
  eval_Universal = tagger_Universal.evaluate(test_data_Universal)
  
  answer1 = "Brown and Universal are training the same data size. Considering the brown tagset is larger than the universal tagset, they both train the same data size. As a result, universal produces more well trained set than the brown tagset, eventhough the increase in the states. Universal tagset contains more transitions and tags per possible state which creates a better observation set compares to the brown set."

  answer2 = "..."


  return eval_Brown, eval_Universal, answer1, answer2
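The comparison above assumes two training/test splits of equal size, one in the native Brown tagset and one in the Universal tagset. Below is a minimal sketch, not part of the original example, of how such inputs might be prepared; the sample size, 90/10 split, and calling code are assumptions.

from nltk.corpus import brown

# the same sentences twice: native Brown tags vs. the coarser Universal tags
sents_brown = list(brown.tagged_sents()[:2000])
sents_universal = list(brown.tagged_sents(tagset='universal')[:2000])

split = int(0.9 * len(sents_brown))  # assumed 90/10 split
train_data_Brown, test_data_Brown = sents_brown[:split], sents_brown[split:]
train_data_Universal, test_data_Universal = sents_universal[:split], sents_universal[split:]

eval_Brown, eval_Universal, answer1, answer2 = compare_taggers(
    train_data_Brown, train_data_Universal, test_data_Brown, test_data_Universal)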
Code Example #3
def __init__(self, mode, train_sents):
    if mode == TRIGRAM:
        # unigram -> bigram -> trigram backoff chain
        self.tagger = UnigramTagger(train_sents)
        self.tagger = BigramTagger(train_sents, backoff=self.tagger)
        self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
    elif mode == HDM:
        self.tagger = HiddenMarkovModelTagger.train(train_sents)
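Only the constructor is shown above, so here is a hedged, self-contained usage sketch; the enclosing class name (ChoiceTagger) and the values of the TRIGRAM and HDM mode constants are assumptions.

from nltk.corpus import treebank
from nltk.tag import (UnigramTagger, BigramTagger, TrigramTagger,
                      HiddenMarkovModelTagger)

TRIGRAM, HDM = 'trigram', 'hmm'  # assumed constant values

class ChoiceTagger:
    def __init__(self, mode, train_sents):
        if mode == TRIGRAM:
            # unigram -> bigram -> trigram backoff chain
            self.tagger = UnigramTagger(train_sents)
            self.tagger = BigramTagger(train_sents, backoff=self.tagger)
            self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
        elif mode == HDM:
            self.tagger = HiddenMarkovModelTagger.train(train_sents)

train_sents = treebank.tagged_sents()[:3000]
print(ChoiceTagger(TRIGRAM, train_sents).tagger.tag(['The', 'cat', 'sat', '.']))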
Code Example #5
File: hmm.py Project: johndpope/jazzparser
 def _dict_to_object(dic):
     from .storage import dict_to_object
     states = dic['states']
     symbols = dic['symbols']
     priors = dict_to_object(dic['priors'])
     outputs = dict_to_object(dic['outputs'])
     transitions = dict_to_object(dic['transitions'])
     return HiddenMarkovModelTagger(symbols, states, transitions, outputs,
                                    priors)
Code Example #6
def build_manual():
	# training sequences sen1 .. sen11 (sen7 is not used in the original)
	seqs = [sen1, sen2, sen3, sen4, sen5, sen6, sen8, sen9, sen10, sen11]
	result = Hmm.train(seqs)

	return result
Code Example #7
 def load(cls, filename):
     """
     Loads and deserializes the pickle file (Diacritics restorer) saved on given path
     :param filename: load path
     :return: The loaded diacritics restorer object
     :rtype: HmmNgramRestorer
     """
     with open(filename, 'rb') as file:
         dump = pickle.load(file)
         hmm = cls(dump["n"])
         dump = dump["tagger"]
         hmm.tagger = HiddenMarkovModelTagger(dump["_symbols"],
                                              dump["_states"],
                                              dump["_transitions"],
                                              dump["_outputs"],
                                              dump["_priors"])
     return hmm
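A possible save() counterpart is sketched below. It is not part of the source; it assumes the restorer keeps its n-gram order in self.n and that the pickled dict has exactly the layout load() reads ("n" plus a "tagger" dict of the tagger's private model attributes).

def save(self, filename):
    """Serialize the diacritics restorer to the given path (sketch)."""
    dump = {
        "n": self.n,  # assumed attribute holding the n-gram order
        "tagger": {
            "_symbols": self.tagger._symbols,
            "_states": self.tagger._states,
            "_transitions": self.tagger._transitions,
            "_outputs": self.tagger._outputs,
            "_priors": self.tagger._priors,
        },
    }
    with open(filename, 'wb') as file:
        pickle.dump(dump, file)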
Code Example #8
    def train(self, labeled_sequence):
        def estimator(fd, bins):
            return LidstoneProbDist(fd, 0.1, bins)

        labeled_sequence = LazyMap(_identity, labeled_sequence)
        symbols = unique_list(word for sent in labeled_sequence for word, tag in sent)
        tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent)

        trainer = HiddenMarkovModelTrainer(tag_set, symbols)
        hmm = trainer.train_supervised(labeled_sequence, estimator=estimator)
        hmm = HiddenMarkovModelTagger(
            hmm._symbols,
            hmm._states,
            hmm._transitions,
            hmm._outputs,
            hmm._priors,
            transform=_identity,
        )
        self.tagger = hmm
Code Example #9
def hmm(train_path, test_path):
    training_sentences = list(gen_corpus(train_path))
    test_sentences = list(gen_corpus(test_path))

    start = perf_counter()

    hmm_model = HiddenMarkovModelTagger.train(list(convert_sents_to_zipped(training_sentences)))

    end = perf_counter()
    print('Training took {} ms.'.format(int((end - start) * 1000)))
    start = perf_counter()
    # Evaluation
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in hmm_model.tag(words))
        y_true.extend(tags)

    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
Code Example #10
def construct_hmm(japanese,english):
	"""
	INPUT:
		*Aligned* parallel arrays
	OUTPUT: 
		An HMM
	"""
	print "[Start] Training HMM"
	# Coming in, we have two parallel arrays of arrays.
	# [ ['j1', 'j2'], ['j1', 'j2'] ] + [ ['e1', 'e2'], ['e1', 'e2'] ]
	# What we need is an array of combined tuples
	# [ [ [j1,e1],[j2,e2] ], [ [j1,e1],[j2,e2] ] ]
	training_data = []
	for i in range(len(japanese)):
		sequence = []
		j_word = japanese[i]
		e_word = english[i]
		for j in range(len(j_word)):
			sequence.append((e_word[j],j_word[j]))
		training_data.append(sequence)
	model = Hmm.train(training_data)
	print "[ End ] Training HMM"
	return model
Code Example #11
def procesado_bigram(texto_entrada):
    return 0


def procesado_naive(texto_entrada):
    return 0


##############################################################################

# Train the taggers
if path.exists('spanish_hmm.plk'):
    hmm_tagger = joblib.load('spanish_hmm.plk')
else:
    # Train the HMM tagger and save it to a file for later runs
    hmm_tagger = HiddenMarkovModelTagger.train(cess_sents)
    with open('spanish_hmm.plk', 'wb') as pickle_file:
        dill.dump(hmm_tagger, pickle_file)

# CHANGE THIS -> run the RegexParser step first, then the rest in order...
# Main menu
print("Selección una Opción:")
print("1.Entrenamiento RegexParser.")
print("2.Test.")
print("3.Salir.")
opcion = input()

if int(opcion) == 1:
    print("Entrenando RegexParser...")
    train_regex(corpus_ejemplo)
Code Example #12
def main():
    parser = argparse.ArgumentParser(description='Text decipher options')
    parser.add_argument('cipher_folder', help='cipher data folder')
    parser.add_argument('--laplace',
                        '-laplace',
                        action='store_true',
                        default=False,
                        help='Laplace Smoothing')
    parser.add_argument('--langmod',
                        '-lm',
                        action='store_true',
                        default=False,
                        help='Improved decoder')

    args = parser.parse_args()
    cipher_folder = args.cipher_folder
    laplace = args.laplace
    langmod = args.langmod
    number_of_supp_lines = 100  #the more lines the slower the code!

    train_data, test_data, train_plain = get_data(cipher_folder)
    preprocess_supp_data()
    supp_data = read_preprocessed_supp_data(number_of_supp_lines)
    for line in train_plain:  #this is so later we have all the transitions in the same place
        supp_data.extend(list(line))

    if laplace:
        smoothing = LaplaceProbDist
    else:
        smoothing = MLEProbDist

    trainer = hmm.HiddenMarkovModelTrainer()
    decoder = trainer.train_supervised(train_data, smoothing)

    #decoder_supp = trainer_supp.train_unsupervised(supp_data, update_outputs=False, model=decoder)
    #because there's a bug in train_unsupervised (although I found out how to fix it!), I will have to do this manually....
    #code copied from the nltk train_supervised method
    #here, we are updating the transition data to include our supplemental data
    if langmod:
        states = decoder._states
        symbols = decoder._symbols
        outputs = decoder._outputs
        priors = decoder._priors
        starting = FreqDist()  # declaring
        # declaring; this is why we needed all the transitions in the same place
        transitions = ConditionalFreqDist()
        for sequence in supp_data:
            lasts = None
            for state in sequence:
                if lasts is None:
                    starting[state] += 1
                else:
                    transitions[lasts][state] += 1
                lasts = state

        if laplace:
            estimator = LaplaceProbDist
        else:
            # getting this straight from the source code
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        N = len(states)
        pi = estimator(starting, N)
        A = ConditionalProbDist(transitions, estimator, N)
        #conditionalPD is actually already defined by our previously trained model as outputs.
        #we don't have new ones!
        decoder = HiddenMarkovModelTagger(symbols, states, A, outputs, pi)

    print(decoder.test(test_data))
    for sent in test_data:
        print "".join([y[1] for y in decoder.tag([x[0] for x in sent])])
Code Example #13
File: hmm.py Project: tomerfiliba/tau
from nltk.tag.hmm import HiddenMarkovModelTagger, DictionaryConditionalProbDist, DictionaryProbDist

def prob(**kwargs):
    return DictionaryProbDist(kwargs)
def condprob(**kwargs):
    return DictionaryConditionalProbDist(kwargs)

hmm = HiddenMarkovModelTagger(
    symbols = ["sound", "sounds", "nice", "dot"],
    states = ["ADJ", "N", "V", "END"],
    transitions = condprob(
        ADJ = prob(N = 0.4, V = 0.4, END = 0.2),
        N = prob(ADJ = 0.2, V = 0.7, END = 0.1),
        V = prob(N = 0.5, END = 0.5),
    ),
    outputs = condprob(
        ADJ = prob(sound = 0.3, nice = 0.7),
        N = prob(sound = 0.5, nice = 0.5),
        V = prob(sound = 0.8, sounds = 0.2),
        END = prob(dot = 1.0),
    ),
    priors = prob(ADJ = 0.3, N = 0.4, V = 0.3, END = 0)
)

for words in ["nice sound dot", "sound sounds nice dot", "sound sound sound dot"]:
    tagged = hmm.tag(words.split())
    print "Best tags:", tagged
    print "Forward probability:", hmm.probability([(w, None) for w in words.split()])
    print "Sequence probability:", hmm.probability(tagged)
    print
Code Example #14
File: hmm_full.py Project: fbdurak/hmm
# MODIFY: Comment/uncomment to modify features
# first_letters_counter = Counter([transform_states(list(n),transforms=TRANSFORM_METHOD)[0] for n in names])
first_letters_counter = Counter([transform_states(list(n),transforms=TRANSFORM_METHOD,**{'ngram_length':NGRAM_LENGTH})[0] for n in names])


first_letters_total = sum([first_letters_counter[l] for l in first_letters_counter.keys()])
for letter in first_letters_counter.keys():
    priors[letter] = first_letters_counter[letter]/float(first_letters_total)
print(priors)
print(states)
priors = DictionaryProbDist(priors)


print("TRAINING")
tagger = HiddenMarkovModelTagger(symbols, states, transitions, outputs, priors)

observations = None
with open('data/input-data/leakedBits.txt','r') as inf:
    observations = inf.read().split('\n')


labels = None
with open('data/input-data/names.txt','r') as inf:
    labels = inf.read().split('\n')
Code Example #15
 # transition prob
 for row in X_train:
     lasts = None
     for ch in list(row):
         if(lasts is not None):
             transitional[lasts][ch] += 1
         lasts = ch
  
 # emission prob
 for row in sequences:
     for pair in row:
         emissional[pair[1]][pair[0]] += 1
 
 if improved_laplace:
     print("################## Laplace ####################### \n")
     estimator = nltk.probability.LaplaceProbDist
 else:
     estimator = lambda fdist, bins: MLEProbDist(fdist)
     
 N = len(symbols)
 PI = estimator(Pi, N)
 A = ConditionalProbDist(transitional, estimator, N)
 B = ConditionalProbDist(emissional, estimator ,N)
  
 # note: NLTK's constructor signature is (symbols, states, transitions, outputs, priors)
 tagger = HiddenMarkovModelTagger(states, symbols, A, B, PI)
 print("\n ################## C{} Decryption Results #######################".format(int(i)) )
 for row in test_cipher:
     print(tagger.best_path(row))
 
 print("\n ################## C{} Accuracy Results #######################". format(int(i)) )
 print(tagger.test(tester))
Code Example #16
File: tasks.py Project: alabarga/MORElab
from celery.signals import celeryd_init
from multiprocessing import Pool

DIFF_THRESHOLD = 0.75
#twitter_stream = None
app = Celery('tasks', broker='redis://localhost:6379/0')



conn = psycopg2.connect("dbname=%s user=%s password=%s" % (postgres_db, postgres_user, postgres_pass))
cur = conn.cursor()

# redis = redis.StrictRedis(host='localhost', port=6379, db=0)

sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)

print('Tagger ready')

def analyze(text, track_list):
    tokens = word_tokenize(text)
    tags = hmm_tagger.tag(tokens)
    for tag in tags:
        if tag[0] in track_list:
            if tag[1].startswith('N') and len(tag[1]) <= 2:
                print(text)
                print(tag)
                return True
    return False