def test():
    """Load a pre-trained HMM segmenter and decode two sample sentences."""
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)

    train_data_path = '../../data/people.txt'
    gen = train_generator(train_data_path, vocabs=vocabs)

    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    # hmm.train(train_generator=gen)

    model_dir = '../../models/hmm'
    # hmm.save_model(model_dir=model_dir)
    hmm.load_model(model_dir=model_dir)

    sentence = "我是中国人,我爱我的祖国"
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。' \
               '这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
def generate_text():
    """Generate text by sampling characters from the trained HMM."""
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']

    vocabs = load_vocab(vocab_path)
    query_vocabs = {idx: char for char, idx in vocabs.items()}

    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)

    pi = hmm.pi_cnt
    tran_p = hmm.trans_cnt  # [S, S]
    emit_p = hmm.emit_cnt   # [S, V]

    # Per-state cumulative distributions: transitions [S, S] and emissions [S, V].
    trans_cdfs = [compute_cdf(tran_p[s, :]) for s in range(tran_p.shape[0])]
    emit_cdfs = [compute_cdf(emit_p[s, :]) for s in range(emit_p.shape[0])]

    # Sample the initial state, then alternate transition and emission sampling.
    state_idx = sample_start(pi)
    out_idx = sample_output(state_idx, emit_cdfs)
    out_char = query_vocabs[out_idx]

    num_text = 1000
    print(out_char, end='')
    for i in range(num_text - 1):
        state_idx = sample_output(state=state_idx, cdfs=trans_cdfs)
        out_idx = sample_output(state=state_idx, cdfs=emit_cdfs)
        out_char = query_vocabs[out_idx]
        print(out_char, end='')
        if (i + 1) % 50 == 0:  # line break every 50 characters
            print()
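# --- Illustrative sketch (not part of the original file) ---------------------
# The repo's own compute_cdf / sample_start / sample_output helpers are defined
# elsewhere and are not shown in this file. The minimal versions below only
# illustrate the behavior generate_text() assumes: normalizing a row of counts
# into a cumulative distribution and drawing samples by inverse-CDF lookup.
import numpy as np


def compute_cdf(counts):
    # Normalize a row of counts into probabilities, then accumulate into a CDF.
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    probs = counts / total if total > 0 else np.full(len(counts), 1.0 / len(counts))
    return np.cumsum(probs)


def sample_start(pi):
    # Draw an initial state index from the initial-state counts pi.
    return int(np.searchsorted(compute_cdf(pi), np.random.rand()))


def sample_output(state, cdfs):
    # Given a state index and a list of per-state CDFs, draw the next index
    # (a successor state for trans_cdfs, or an emitted character for emit_cdfs).
    return int(np.searchsorted(cdfs[state], np.random.rand()))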
def train():
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)

    train_data_path = '../../data/people.txt'
    gen = train_generator(train_data_path, vocabs=vocabs)

    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.train(train_generator=gen)

    model_dir = '../../models/hmm'
    hmm.save_model(model_dir=model_dir)
def test_hmm():
    """Load a pre-trained HMM segmenter and decode test inputs."""
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}

    vocabs = load_vocab(vocab_path)
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)

    sentence = "我是中国人,我爱我的祖国"
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。' \
               '这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    # NOTE: the assignment below overrides the Chinese sentence above, so only
    # the English input is decoded here; remove it to segment the sentence above.
    sentence = 'I love you china'
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
    return 0


if __name__ == '__main__':
    base_dir = "/Users/flyingman/Developer/github/chinese_segmentation"
    instance_path = os.path.join(base_dir, "data/people_instance.txt")
    feature_path = os.path.join(base_dir, "data/people_feature.txt")
    model_dir = os.path.join(base_dir, "models/max_entropy")

    states = ['B', 'M', 'E', 'S']
    vocab_path = '../../data/people_char_vocab.pkl'
    train_data_path = '../../data/people.txt'
    features = [func_cur, func_is_symbol]
    # Overrides the absolute model_dir built above with a relative path.
    model_dir = '../../models/max_entropy'
    epochs = 1000

    vocab = load_vocab(vocab_path=vocab_path)
    # print(vocab)

    max_ent = MaximumEntropy(line_limit=100, states=states)
    max_ent.train(feature_path=feature_path, epochs=epochs)

    """
    sentences = "我是中国人"
    seg_words = max_ent.predict(sentences=sentences)
    print(seg_words)
    """