import re
import sys

from nltk.corpus import cess_esp
from nltk.tag import HiddenMarkovModelTagger


def printSpanishTags(self):
    # Train an HMM tagger on the CESS-ESP tagged corpus, then tag every
    # Spanish sentence in this object's corpus and print one "token:tag"
    # pair per line.
    sents = cess_esp.tagged_sents()
    tagger = HiddenMarkovModelTagger.train(sents)
    fullCorpus = self.fullCorpus()
    for line in fullCorpus:
        spanishSentence = line[0]
        # Split on runs of non-word characters (Python 2: decode the byte
        # string to unicode first so \W respects accented characters).
        spanishTokens = re.compile(r'\W+', re.UNICODE).split(
            unicode(spanishSentence, 'utf-8'))
        tags = tagger.tag(spanishTokens)
        for idx, token in enumerate(spanishTokens):
            if len(token) > 0:
                tag = tags[idx][1]
                sys.stdout.write(token.encode('utf-8'))
                sys.stdout.write(":")
                sys.stdout.write(tag)
                sys.stdout.write("\n")
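# Usage sketch (not part of the original class): the same train-and-tag flow
# as a standalone demo, training on a slice of CESS-ESP to keep it fast.
# Assumes the corpus data is installed via nltk.download('cess_esp').
demo_tagger = HiddenMarkovModelTagger.train(cess_esp.tagged_sents()[:500])
demo_tokens = [t for t in re.split(r'\W+', u'El gato come pescado',
                                   flags=re.UNICODE) if t]
for token, tag in demo_tagger.tag(demo_tokens):
    print('%s:%s' % (token, tag))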
# <markdowncell>
# Split into training and test set

# <codecell>
training_dx = int(len(sents) * 90 / 100)
training = sents[:training_dx]
test = sents[training_dx:]  # slice from the split point; starting at training_dx+1 silently drops one sentence

# <markdowncell>
# Train the tagger and check its accuracy (this takes 40 seconds or so) ...

# <codecell>
from nltk import HiddenMarkovModelTagger
spanish_tagger = HiddenMarkovModelTagger.train(training)
'accuracy %.1f %%' % (spanish_tagger.evaluate(test) * 100)

# <codecell>
# tokenize() is never defined in this notebook; see the sketch after this cell
spanish_tagger.tag(tokenize("A buen entendedor, pocas palabras bastan."))

# <codecell>
spanish_tagger.tag(tokenize("El gato blanco se sentó en la alfombra."))
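# <markdowncell>
# The cells above call tokenize() without defining it. A minimal sketch,
# assuming a plain regex word splitter is all the tagger needs; NLTK's
# nltk.word_tokenize would work just as well if the punkt data is installed.

# <codecell>
import re

def tokenize(text):
    # split on runs of non-word characters and drop the empty strings
    # that re.split leaves around leading/trailing punctuation
    return [t for t in re.split(r'\W+', text, flags=re.UNICODE) if t]

tokenize("A buen entendedor, pocas palabras bastan.")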
# <markdowncell>
# Now Portuguese

# <codecell>
# These helpers come from older NLTK's nltk.tag.hmm module: _TEXT is the
# index of the token text in a (text, tag) pair, _NINF is the log-space
# "zero", and _log_add is log-sum-exp (see the sketch after this method).
from nltk.tag import HiddenMarkovModelTagger
from nltk.tag.hmm import _log_add, _NINF, _TEXT
from numpy import float64, ones


def map(self, key, value):
    """
    establish the hmm model and estimate the local hmm parameters
    from the input sequences

    @param key: None
    @param value: input sequence
    """
    symbols, states, A, B, pi = self.read_params()
    N = len(states)
    M = len(symbols)
    symbol_dict = dict((symbols[i], i) for i in range(M))
    model = HiddenMarkovModelTagger(symbols=symbols, states=states,
                                    transitions=A, outputs=B, priors=pi)
    logprob = 0
    sequence = list(value)
    if not sequence:
        return

    # compute forward and backward probabilities
    alpha = model._forward_probability(sequence)
    beta = model._backward_probability(sequence)

    # find the log probability of the sequence
    T = len(sequence)
    lpk = _log_add(*alpha[T - 1, :])
    logprob += lpk

    # now update A and B (transition and output probabilities)
    # using the alpha and beta values. Please refer to Rabiner's
    # paper for details, it's too hard to explain in comments
    local_A_numer = ones((N, N), float64) * _NINF
    local_B_numer = ones((N, M), float64) * _NINF
    local_A_denom = ones(N, float64) * _NINF
    local_B_denom = ones(N, float64) * _NINF

    # for each position, accumulate sums for A and B
    for t in range(T):
        x = sequence[t][_TEXT]  # not found? FIXME
        if t < T - 1:
            xnext = sequence[t + 1][_TEXT]  # not found? FIXME
        xi = symbol_dict[x]
        for i in range(N):
            si = states[i]
            if t < T - 1:
                for j in range(N):
                    sj = states[j]
                    local_A_numer[i, j] = _log_add(
                        local_A_numer[i, j],
                        alpha[t, i]
                        + model._transitions[si].logprob(sj)
                        + model._outputs[sj].logprob(xnext)
                        + beta[t + 1, j])
                local_A_denom[i] = _log_add(local_A_denom[i],
                                            alpha[t, i] + beta[t, i])
            else:
                # at the last position, B's denominator is A's running sum
                # (t = 0 .. T-2) plus the final alpha*beta term, so
                # local_B_denom ends up summed over all t
                local_B_denom[i] = _log_add(local_A_denom[i],
                                            alpha[t, i] + beta[t, i])
            local_B_numer[i, xi] = _log_add(local_B_numer[i, xi],
                                            alpha[t, i] + beta[t, i])

    # emit the local statistics for the reducer to combine
    for i in range(N):
        self.outputcollector.collect(
            "parameters",
            tuple2str(("Pi", states[i], pi.prob(states[i]))))
    self.collect_matrix('A', local_A_numer, lpk, N, N)
    self.collect_matrix('B', local_B_numer, lpk, N, M)
    self.collect_matrix('A_denom', [local_A_denom], lpk, 1, N)
    self.collect_matrix('B_denom', [local_B_denom], lpk, 1, N)
    self.outputcollector.collect("parameters",
                                 "states " + tuple2str(tuple(states)))
    self.outputcollector.collect("parameters",
                                 "symbols " + tuple2str(tuple(symbols)))
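# Reference sketch (not part of the original mapper): _log_add is the
# log-sum-exp helper from older NLTK's nltk.tag.hmm. Newer NLTK versions
# dropped it, in which case an equivalent can be defined directly:
import math

_NINF = float('-inf')

def _log_add(*values):
    # computes log(exp(v1) + exp(v2) + ...), factoring out the largest
    # value so the exponentials cannot overflow
    x = max(values)
    if x > _NINF:
        return x + math.log(sum(math.exp(v - x) for v in values))
    return x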