def _counter_known(parsed, train, known, discount, tag2id, word2id, prior):
    """Pick which data set feeds each of the two tables returned by ``counter``.

    ``counter`` is called as ``counter(data, tag2id, word2id, discount, prior)``
    and its result is unpacked into exactly two values, referred to here as
    (emission, transition).

    * ``known`` falsy: both values come from a single call on ``train``.
    * ``known`` truthy: the first value comes from a call on ``parsed`` and
      the second from a separate call on ``train``; the other half of each
      result is discarded.

    Returns:
        A 2-tuple ``(emission, transition)``.
    """
    # Everything after the data set is identical across all calls.
    shared_args = (tag2id, word2id, discount, prior)

    if not known:
        emit_table, trans_table = counter(train, *shared_args)
        return emit_table, trans_table

    # Call order matters if ``counter`` has side effects: ``parsed`` first,
    # then ``train``.
    emit_table, _unused = counter(parsed, *shared_args)
    _unused, trans_table = counter(train, *shared_args)
    return emit_table, trans_table
for j in xrange(1, len(seq)): for i in xrange(len(transition)): k_score = scores[:, j - 1] + np.log(transition[:, i]) + np.log(emission[i, seq[j]]) backpointer[i, j] = np.argmax(k_score) scores[i, j] = k_score[backpointer[i, j]] j = int(np.argmax(scores, axis=0)[-1]) sol = [j] for i in xrange(len(seq) - 1, 0, -1): j = backpointer[j, i] sol.append(j) sol.reverse() return zip(seq[:-1], sol[:-1]) if __name__ == "__main__": path = "../WSJ-2-12/*/*.POS" docs = glob(path) parsed = parse(docs) np.random.shuffle(parsed) parsed = trigramize(parsed) tag2id, word2id = build_dict(parsed[:-10]) id2word = {v: k for k, v in word2id.iteritems()} id2tag = {v: k for k, v in tag2id.iteritems()} emission, transition = counter(parsed[:-10], tag2id, word2id) print "test POS", parsed[-1][1:-1] output = viterbi(parsed[-1], transition, emission, word2id, tag2id) print "TAGGED", id_to_token(output, id2word, id2tag)