def generate_text():
    """Sample characters from the trained HMM: draw a start state from pi, then
    alternately sample the next hidden state and an emitted character."""
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']

    vocabs = load_vocab(vocab_path)
    query_vocabs = {idx: char for char, idx in vocabs.items()}

    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)

    pi = hmm.pi_cnt
    tran_p = hmm.trans_cnt   # [S, S]
    emit_p = hmm.emit_cnt    # [S, V]

    # Per-state cumulative distributions for inverse-CDF sampling.
    trans_cdfs = [compute_cdf(tran_p[s, :]) for s in range(tran_p.shape[0])]  # [S, S]
    emit_cdfs = [compute_cdf(emit_p[s, :]) for s in range(emit_p.shape[0])]   # [S, V]

    state_idx = sample_start(pi)
    out_idx = sample_output(state_idx, emit_cdfs)
    out_char = query_vocabs[out_idx]

    num_text = 1000
    print(out_char, end='')
    for i in range(num_text - 1):
        state_idx = sample_output(state=state_idx, cdfs=trans_cdfs)  # next hidden state
        out_idx = sample_output(state=state_idx, cdfs=emit_cdfs)     # emitted character
        out_char = query_vocabs[out_idx]
        print(out_char, end='')
        if (i + 1) % 50 == 0:
            print()
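# --- Hypothetical sketch of the sampling helpers generate_text() relies on.
# compute_cdf / sample_start / sample_output are not defined in this file; the
# assumption here is that they do inverse-CDF sampling over (possibly
# unnormalized) count rows. Names and signatures follow the calls above.
import numpy as np

def compute_cdf(counts):
    """Normalize a 1-D count vector and return its cumulative distribution."""
    probs = counts / np.sum(counts)
    return np.cumsum(probs)

def sample_start(pi):
    """Sample the initial hidden-state index from the start distribution pi."""
    cdf = compute_cdf(pi)
    return int(np.searchsorted(cdf, np.random.rand()))

def sample_output(state, cdfs):
    """Sample an index from the CDF row of `state`: the next state when given
    trans_cdfs, or an emitted character id when given emit_cdfs."""
    return int(np.searchsorted(cdfs[state], np.random.rand()))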
def test():
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)
    train_data_path = '../../data/people.txt'
    gen = train_generator(train_data_path, vocabs=vocabs)

    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    # hmm.train(train_generator=gen)
    model_dir = '../../models/hmm'
    # hmm.save_model(model_dir=model_dir)
    hmm.load_model(model_dir=model_dir)

    sentence = "我是中国人,我爱我的祖国"
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。' \
               '这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
def lihang_example():
    # Toy HMM from Li Hang's "Statistical Learning Methods" (the box/ball example).
    T = np.array([[0.5, 0.2, 0.3],
                  [0.3, 0.5, 0.2],
                  [0.2, 0.3, 0.5]])
    E = np.array([[0.5, 0.5],
                  [0.4, 0.6],
                  [0.7, 0.3]])
    pi = np.array([0.2, 0.4, 0.4])

    # states = [0, 1, 2]
    states = {'a': 0, 'b': 1, 'c': 2}
    vocabs = {'red': 0, 'white': 1}

    hmm = HMM(states=states, vocabs=vocabs, pi=pi, trans_p=T, emit_p=E)

    O = ['red', 'white', 'red']
    f_prob = hmm.forward_evaluate(O)
    print('forward prob', f_prob)

    b_prob = hmm.backward_evaluate(O)
    print('backward prob', b_prob)

    decode_states = {0: 'a', 1: 'b', 2: 'c'}
    hiddens = hmm.decode(O, decode_states=decode_states)
    print('optimal hiddens', hiddens)
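# A self-contained check of the example above (Li Hang, Example 10.2): re-run the
# forward recursion with plain numpy, independent of the HMM class, to confirm
# P(O|lambda) ~= 0.130218 for O = (red, white, red). check_lihang_forward is a
# name introduced here for illustration only.
import numpy as np

def check_lihang_forward():
    T = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])
    E = np.array([[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]])
    pi = np.array([0.2, 0.4, 0.4])
    obs_idx = [0, 1, 0]                    # red, white, red

    alpha = pi * E[:, obs_idx[0]]          # alpha_1(i) = pi_i * b_i(o_1)
    for o in obs_idx[1:]:
        alpha = (alpha @ T) * E[:, o]      # alpha_t(j) = (sum_i alpha_{t-1}(i) * a_ij) * b_j(o_t)
    print('forward prob (numpy check):', alpha.sum())   # ~0.130218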
def train():
    vocab_path = '../../data/people_char_vocab.pkl'
    vocabs = load_vocab(vocab_path)
    train_data_path = '../../data/people.txt'
    gen = train_generator(train_data_path, vocabs=vocabs)

    states = ['B', 'M', 'E', 'S']
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.train(train_generator=gen)

    model_dir = '../../models/hmm'
    hmm.save_model(model_dir=model_dir)
def mini_train():
    states = ['B', 'M', 'E', 'S']
    vocabs = {'我': 0, '是': 1, '中': 2, '国': 3, '人': 4, '家': 5}
    corpus = ['我 是 中国 人', '中国 是 我 家']

    hmm = HMM(vocabs=vocabs)
    # hmm.train(train_generator=mini_generator(corpus), max_seq_len=2)
    # hmm.save_model(model_dir='../../models/hmm')
    hmm.load_model(model_dir='../../models/hmm')
    hmm.cut(sentence='我是中国人')
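# mini_generator is referenced above but not defined in this file. A minimal
# sketch under the assumption that the trainer consumes (characters, BMES tags)
# pairs built from whitespace-segmented lines: a single-character word maps to
# 'S', a longer word to 'B' + 'M'*(len-2) + 'E', matching states = ['B', 'M', 'E', 'S'].
def mini_generator(corpus):
    for line in corpus:
        chars, tags = [], []
        for word in line.split():
            chars.extend(word)
            if len(word) == 1:
                tags.append('S')
            else:
                tags.extend(['B'] + ['M'] * (len(word) - 2) + ['E'])
        yield chars, tags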
def test_hmm():
    vocab_path = '../../data/people_char_vocab.pkl'
    model_dir = '../../models/hmm'
    states = ['B', 'M', 'E', 'S']
    decode_states = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}

    vocabs = load_vocab(vocab_path)
    hmm = HMM(vocabs=vocabs, states=states)
    hmm.load_model(model_dir=model_dir)

    sentence = "我是中国人,我爱我的祖国"
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print(hiddens)
    print('/ '.join(words))

    sentence = '4月29日,雄浑悠长的钟声响起,关闭了近百日的武汉黄鹤楼重新开门迎客。' \
               '这钟声,传递出中华民族从磨难中奋起的昂扬斗志,彰显出伟大民族精神在新时代焕发出的熠熠光辉。'
    sentence = 'I love you china'  # overrides the Chinese sentence above; only this one is decoded
    hiddens = hmm.decode(outputs=sentence, decode_states=decode_states)
    words = hmm.format_hiddens(hiddens, sentence)
    print('/ '.join(words))
    fw.close()  # end of the preceding routine: close the segmentation result file


if __name__ == "__main__":
    data_dir = get_data_dir()
    model_dir = get_model_dir()
    model_path = os.path.join(model_dir, "hmm", "hmm.pkl")
    test_path = os.path.join(data_dir, "msr_test.utf8")
    test_result_path = os.path.join(data_dir, "msr_test_hmm.utf8")
    dict_path = os.path.join(data_dir, "msr.dict")

    word_dict = load_dictionary(dict_path=dict_path)
    print("Total number of words is: %d\n" % (len(word_dict)))

    hmm = HMM()
    hmm.load_model(model_path=model_path, is_training=False)

    seg_res = seg_on_sentence(hmm, sentence='黑夜给了我黑色的眼睛,我却用它寻找光明。')
    print("/".join(seg_res))

    seg_on_file(model=hmm, test_path=test_path, test_result_path=test_result_path,
                is_use_matching=True, matching_method="bimm",
                max_num_char=6, word_dict=word_dict)
    print("Segmentation done!", test_result_path)
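# --- Illustrative sketch only: seg_on_file above is called with
# matching_method="bimm" (bidirectional maximum matching). The project's own
# implementation is not shown here; the functions below are a generic version,
# assuming word_dict is a set-like container of known words and max_num_char
# bounds the window length. The tie-break rule (fewer words, then fewer
# single-character words) is one common convention, not necessarily the one used.
def fmm(sentence, word_dict, max_num_char=6):
    """Forward maximum matching: greedily take the longest dictionary word from the left."""
    words, i = [], 0
    while i < len(sentence):
        for j in range(min(len(sentence), i + max_num_char), i, -1):
            if j - i == 1 or sentence[i:j] in word_dict:
                words.append(sentence[i:j])
                i = j
                break
    return words

def bmm(sentence, word_dict, max_num_char=6):
    """Backward maximum matching: same idea, scanning from the right."""
    words, j = [], len(sentence)
    while j > 0:
        for i in range(max(0, j - max_num_char), j):
            if j - i == 1 or sentence[i:j] in word_dict:
                words.insert(0, sentence[i:j])
                j = i
                break
    return words

def bimm(sentence, word_dict, max_num_char=6):
    """Bidirectional matching: prefer the segmentation with fewer words,
    then the one with fewer single-character words."""
    f = fmm(sentence, word_dict, max_num_char)
    b = bmm(sentence, word_dict, max_num_char)
    if len(f) != len(b):
        return f if len(f) < len(b) else b
    f_single = sum(1 for w in f if len(w) == 1)
    b_single = sum(1 for w in b if len(w) == 1)
    return f if f_single <= b_single else b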
from cangjie.hmm.hmm import HMM
from cangjie.utils.config import get_data_dir, get_model_dir
import os


if __name__ == '__main__':
    data_dir = get_data_dir()
    model_dir = get_model_dir()
    model_path = os.path.join(model_dir, "hmm", "hmm.pkl")

    hmm = HMM()
    # train_data_path = os.path.join(data_dir, "msr_training.utf8")
    # hmm.train(train_path=train_data_path, model_path=model_path, is_incre_train=False)
    train_data_path = os.path.join(data_dir, "people.txt")
    hmm.train(train_path=train_data_path, model_path=model_path, is_incre_train=True)