N_CLASS = 4 # B: 0, M: 1, E: 2, S: 3 N_EPOCH = 1 sample = '我来到大学读书,希望学到知识' py = int(sys.version[0]) def to_seq(*args): data = [] for x in args: x = x[:(len(x) - len(x) % SEQ_LEN)] data.append(np.reshape(x, [-1, SEQ_LEN])) return data if __name__ == '__main__': x_train, y_train, x_test, y_test, vocab_size, char2idx, idx2char = chseg.load_data( ) X_train, X_test, Y_train, Y_test = to_seq(x_train, x_test, y_train, y_test) print('Vocab size: %d' % vocab_size) clf = BiRNN_CRF(SEQ_LEN, vocab_size, N_CLASS) clf.fit(X_train, Y_train, val_data=(X_test, Y_test), n_epoch=N_EPOCH) chars = list(sample) if py == 3 else list(sample.decode('utf-8')) labels = clf.infer([char2idx[c] for c in chars]) res = '' for i, l in enumerate(labels): c = sample[i] if py == 3 else sample.decode('utf-8')[i] if l == 2 or l == 3: c += ' ' res += c print(res)
N_CLASS = 4 # B: 0, M: 1, E: 2, S: 3 N_EPOCH = 1 sample = '我来到大学读书,希望学到知识' py = int(sys.version[0]) def to_seq(*args): data = [] for x in args: x = x[: (len(x) - len(x) % SEQ_LEN)] data.append(np.reshape(x, [-1, SEQ_LEN])) return data if __name__ == '__main__': x_train, y_train, x_test, y_test, vocab_size, char2idx, idx2char = chseg.load_data() X_train, X_test, Y_train, Y_test = to_seq(x_train, x_test, y_train, y_test) print('Vocab size: %d' % vocab_size) clf = BiRNN(SEQ_LEN, vocab_size, N_CLASS, n_layer=2) clf.fit(X_train, Y_train, val_data=(X_test, Y_test), n_epoch=N_EPOCH) chars = list(sample) if py == 3 else list(sample.decode('utf-8')) labels = clf.infer([char2idx[c] for c in chars]) res = '' for i, l in enumerate(labels): c = sample[i] if py == 3 else sample.decode('utf-8')[i] if l == 2 or l == 3: c += ' ' res += c print(res)
def to_test_seq(*args): data = [] for x in args: x = x[:(len(x) - len(x) % SEQ_LEN)] data.append(np.reshape(x, [-1, SEQ_LEN])) return data def iter_seq(x, text_iter_step=3): return np.array( [x[i:i + SEQ_LEN] for i in range(0, len(x) - SEQ_LEN, text_iter_step)]) if __name__ == '__main__': x_train, y_train, x_test, y_test, vocab_size, word2idx, idx2word = chseg.load_data( ) X_train, Y_train = to_train_seq(x_train, y_train) X_test, Y_test = to_test_seq(x_test, y_test) print('Vocab size: %d' % vocab_size) clf = Tagger(vocab_size, N_CLASS, SEQ_LEN, num_blocks=3) clf.fit(X_train, Y_train, val_data=(X_test, Y_test), n_epoch=N_EPOCH, batch_size=BATCH_SIZE) chars = list(sample) if py == 3 else list(sample.decode('utf-8')) _test = [word2idx[w] for w in sample] + [0] * (SEQ_LEN - len(sample)) labels = clf.infer(_test, len(sample)) labels = labels[:len(sample)]