コード例 #1
0
def _counter_known(parsed, train, known, discount, tag2id, word2id, prior):
    """Select which ``counter`` results to return, depending on ``known``.

    When ``known`` is truthy, ``counter`` is run on ``parsed`` and then on
    ``train``; the first value of the ``parsed`` run and the second value of
    the ``train`` run are kept. Otherwise both values come from a single
    ``counter`` run on ``train``.

    ``tag2id``, ``word2id``, ``discount`` and ``prior`` are forwarded to
    ``counter`` unchanged, in that positional order.

    Returns:
        The ``(emission, transition)`` tuple chosen as described above.
    """
    shared_args = (tag2id, word2id, discount, prior)

    if not known:
        emission, transition = counter(train, *shared_args)
        return emission, transition

    # ``counter`` is invoked on ``parsed`` before ``train``; only one half
    # of each result pair is kept.
    emission, _ignored = counter(parsed, *shared_args)
    _ignored, transition = counter(train, *shared_args)
    return emission, transition
コード例 #2
0
    for j in xrange(1, len(seq)):
        for i in xrange(len(transition)):
            k_score = scores[:, j - 1] + np.log(transition[:, i]) + np.log(emission[i, seq[j]])
            backpointer[i, j] = np.argmax(k_score)
            scores[i, j] = k_score[backpointer[i, j]]

    j = int(np.argmax(scores, axis=0)[-1])
    sol = [j]
    for i in xrange(len(seq) - 1, 0, -1):
        j = backpointer[j, i]
        sol.append(j)
    sol.reverse()
    return zip(seq[:-1], sol[:-1])


if __name__ == "__main__":
    path = "../WSJ-2-12/*/*.POS"
    docs = glob(path)

    parsed = parse(docs)

    np.random.shuffle(parsed)
    parsed = trigramize(parsed)
    tag2id, word2id = build_dict(parsed[:-10])
    id2word = {v: k for k, v in word2id.iteritems()}
    id2tag = {v: k for k, v in tag2id.iteritems()}
    emission, transition = counter(parsed[:-10], tag2id, word2id)
    print "test POS", parsed[-1][1:-1]
    output = viterbi(parsed[-1], transition, emission, word2id, tag2id)
    print "TAGGED", id_to_token(output, id2word, id2tag)