def k_fold_cross_valid_known(k, parsed, known, discounts): res = defaultdict(list) for train, test in _fold(parsed, k): for discount in discounts: print 'train: ', len(train), 'test: ', len(test) tag2id, word2id = build_dict(parsed) id2tag = {v: k for k, v in tag2id.iteritems()} id2word = {v: k for k, v in word2id.iteritems()} emission, transition = _counter_known(parsed, train, known, 0.85, tag2id, word2id, discount) count_ok, count_total = 0., 0. for i, seq in enumerate(test): out = viterbi(seq, transition, emission, word2id, tag2id) ok, total = _compare(seq[1:-1], id_to_token(out, id2word, id2tag)) count_ok += ok; count_total += total if DEBUG: print 'evaluating', i, 'th sentence.', count_ok/count_total, 'so far.' res[discount].append(count_ok/count_total) print 'Fold accuracy: ', res[discount][-1], 'discount: ', discount for d in res: print 'discount:', d, '->', 'avg:', np.mean(res[d])
for j in xrange(1, len(seq)): for i in xrange(len(transition)): k_score = scores[:, j - 1] + np.log(transition[:, i]) + np.log(emission[i, seq[j]]) backpointer[i, j] = np.argmax(k_score) scores[i, j] = k_score[backpointer[i, j]] j = int(np.argmax(scores, axis=0)[-1]) sol = [j] for i in xrange(len(seq) - 1, 0, -1): j = backpointer[j, i] sol.append(j) sol.reverse() return zip(seq[:-1], sol[:-1]) if __name__ == "__main__": path = "../WSJ-2-12/*/*.POS" docs = glob(path) parsed = parse(docs) np.random.shuffle(parsed) parsed = trigramize(parsed) tag2id, word2id = build_dict(parsed[:-10]) id2word = {v: k for k, v in word2id.iteritems()} id2tag = {v: k for k, v in tag2id.iteritems()} emission, transition = counter(parsed[:-10], tag2id, word2id) print "test POS", parsed[-1][1:-1] output = viterbi(parsed[-1], transition, emission, word2id, tag2id) print "TAGGED", id_to_token(output, id2word, id2tag)