def gen_pinyin2hanzi(pinyin, num=5):
    """Map candidate hanzi strings for a pinyin sequence to their scores.

    Args:
        pinyin: sequence of pinyin syllables passed to ``viterbi`` as the
            observations.
        num: number of candidate paths requested from ``viterbi``.

    Returns:
        dict whose keys are the candidate paths joined into one string and
        whose values are the corresponding scores (requested with
        ``log=True``). If two candidates join to the same string, the later
        one wins.
    """
    params = DefaultHmmParams()
    candidates = viterbi(hmm_params=params, observations=pinyin,
                         path_num=num, log=True)
    return {''.join(candidate.path): candidate.score for candidate in candidates}
def top_k_transform(importance_score, list_of_texts, porpotion, new_word_dictionary, black_list_word):
    """Replace the highest-scoring Chinese tokens of a text with other hanzi.

    The text is tokenized, the ``k`` tokens with the largest importance
    scores are selected, and each selected Chinese token is converted to
    pinyin, decoded back to candidate hanzi with the HMM, and replaced by the
    candidate that ``calculate_similarity`` returns.

    Args:
        importance_score: indexable sequence of scores, one per token
            position of the tokenized text.
        list_of_texts: the text to transform (passed to ``tokenize``).
        porpotion: fraction of tokens to replace;
            ``k = int(len(tokens) * porpotion) + 1``.
        new_word_dictionary: dict counting how often each replacement word
            has been produced; updated in place.
        black_list_word: list of replacement words that reached the usage
            limit; appended to in place.

    Returns:
        The transformed text (tokens joined without separator), or the input
        unchanged when no replacement succeeded.
    """
    hmmparams = DefaultHmmParams()  # HMM parameters for pinyin -> hanzi
    target_text = tokenize(list_of_texts).split(' ')
    k = int(len(target_text) * porpotion) + 1

    # Rank token *positions* by score. Looking positions up afterwards with
    # list.index() returns the first match for tied scores, which selected
    # the same token repeatedly and skipped the other tied tokens.
    top_k_score_index = heapq.nlargest(
        k, range(len(importance_score)), key=lambda pos: importance_score[pos])

    # Single-syllable readings that are randomly swapped for a similar one.
    confusable = {
        'ni': [['li'], ['ni']],
        'ta': [['ta'], ['te']],
        'cao': [['ca'], ['cao']],
        'ma': [['me'], ['ma']],
        'si': [['shi'], ['si']],
    }

    for index in top_k_score_index:
        # Copy of the text as it stands before this replacement.
        gedit_text = copy.deepcopy(list_of_texts)
        if not is_Chinese(target_text[index]):
            continue

        pinyin_of_target_text = lazy_pinyin(target_text[index])
        if len(pinyin_of_target_text) == 1 and pinyin_of_target_text[0] in confusable:
            pinyin_of_target_text = random.choice(confusable[pinyin_of_target_text[0]])

        try:
            # Pinyin back to candidate hanzi sequences.
            hanzi_of_target_test = viterbi(hmm_params=hmmparams,
                                           observations=pinyin_of_target_text,
                                           path_num=10)
            # Greedy choice of the replacement among the candidates.
            # NOTE(review): `i` is not defined in this function; unless a
            # module-level `i` exists this raises NameError, which the
            # handler below swallows, so no token is ever replaced. Confirm
            # the intended argument.
            m_destination_word = calculate_similarity(
                target_text, index, list_of_texts[i], hanzi_of_target_test,
                gedit_text, black_list_word)
            target_text[index] = m_destination_word
            list_of_texts = ''.join(target_text)

            # Record the new word; once it has been produced 20 times, drop
            # its counter and move it to the black list.
            temp = new_word_dictionary.get(m_destination_word, 0) + 1
            if temp < 20:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
        except Exception:
            # Best effort: a token that cannot be converted is left as is.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    return list_of_texts
def get(self):
    """Handle a GET request: decode a pinyin string into hanzi candidates.

    Request arguments (read through ``self.get_argument``):
        word: pinyin syllables separated by single spaces; the token
            ``"end"`` is appended as a final observation before decoding.
        num: number of candidate paths to return.

    Writes a JSON array with one ``{"score": ..., "path": [...]}`` object
    per candidate.

    Raises:
        ValueError: if ``num`` is not an integer literal.
    """
    word = self.get_argument('word') + " end"
    # Request arguments are text; viterbi needs an integer path count.
    num = int(self.get_argument('num'))
    observations = tuple(word.split(" "))

    candidates = []
    for item in viterbi(hmm_params=hmmparams, observations=observations,
                        path_num=num, log=True):
        print(item.score, item.path)
        # A dict, not a set literal: item.path is a list (unhashable) and
        # sets cannot be serialised by json.dumps.
        candidates.append({"score": item.score, "path": item.path})
    self.write(json.dumps(candidates))
def transfer_pinyin_to_hanzi_by_hmm(sets):
    """Convert pinyin to hanzi with the HMM model.

    :param sets: pinyin observations handed to ``viterbi``.
    :return: the ``path`` of the last result item (a single path is
        requested), or ``''`` when ``viterbi`` yields nothing.
    :raises Exception: wraps any error raised while decoding, with args
        ``('error:', original_error)``.
    """
    best_path = ''
    try:
        decoded = viterbi(hmm_params=hmmparams, observations=sets,
                          path_num=1, log=True)
        for candidate in decoded:
            best_path = candidate.path
    except Exception as error:
        raise Exception('error:', error)
    return best_path
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Round-trip check: hanzi -> pinyin (pypinyin) -> hanzi (Pinyin2Hanzi HMM).
# Prints OK when the decoded text equals the original, otherwise the
# original text, its pinyin and the decoded text.
from pypinyin import pinyin, lazy_pinyin, Style
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

txt = u'锄禾日当午'
py = lazy_pinyin(txt)

hmmparams = DefaultHmmParams()
result = viterbi(hmm_params=hmmparams, observations=py, path_num=1)

# Default to an empty string so an empty result is reported as a mismatch
# instead of failing with NameError.
txt_rtn = u''
for item in result:
    txt_rtn = u''.join(item.path)

# Single-argument print(...) is valid on both Python 2 and Python 3.
if txt == txt_rtn:
    print(u'OK')
else:
    print(u'Error: %s -> %s -> %s' % (txt, py, txt_rtn))
# NOTE(review): the three functions below take `self`; they look like methods
# of the HMM-parameter class instantiated further down as HmmParams(), whose
# header is outside this excerpt. Re-indent them under that class if so.
def emission(self, state, observation):
    """Look up the emission probability of `observation` given `state`.

    Per the original note: state (hanzi) -> observation (pinyin).
    """
    per_state = self.emission_probability[state]
    return per_state[observation]


def transition(self, from_state, to_state):
    """Look up the probability of moving from `from_state` to `to_state`."""
    outgoing = self.transition_probability[from_state]
    return outgoing[to_state]


def get_states(self, observation):
    """Return the states that may produce `observation`.

    The observation is not consulted: the full state collection is returned.
    """
    return self.states


# Decode the same observation sequence twice, asking for 10 and then 2
# candidate paths, printing score and path per candidate and a separator
# line after each run.
for top_n in (10, 2):
    result = viterbi(hmm_params=HmmParams(),
                     observations=('normal', 'cold', 'dizzy'),
                     path_num=top_n,
                     log=False)
    for item in result:
        print(item.score, item.path)
    print(20 * '--')
# coding: utf-8 from __future__ import (print_function, unicode_literals) import sys sys.path.append('..') from Pinyin2Hanzi import DefaultHmmParams from Pinyin2Hanzi import viterbi hmmparams = DefaultHmmParams() result = viterbi(hmm_params=hmmparams, observations=('ni', 'hao', 'a'), path_num=5, log=True) for item in result: print(item.score, '/'.join(item.path)) print(20 * '--') result = viterbi(hmm_params=hmmparams, observations=('ni', 'hao', 'a'), path_num=2, log=True) for item in result: print(item.score, '/'.join(item.path)) print(20 * '--') result = viterbi(hmm_params=hmmparams, observations=('chuang', 'qian', 'ming', 'yve', 'guang'),
return self.start_probability[state] def emission(self, state, observation): ''' state (hanzi) -> observation (pinyin) ''' return self.emission_probability[state][observation] def transition(self, from_state, to_state): ''' state -> state ''' return self.transition_probability[from_state][to_state] def get_states(self, observation): ''' get states which produce the given obs ''' return self.states result = viterbi(hmm_params=HmmParams(), observations=('normal', 'cold', 'dizzy'), path_num = 10, log = False) for item in result: print(item.score, item.path) print(20*'--') result = viterbi(hmm_params=HmmParams(), observations=('normal', 'cold', 'dizzy'), path_num = 2, log = False) for item in result: print(item.score, item.path) print(20*'--') result = viterbi(hmm_params=HmmParams(), observations=('normal', 'cold', 'dizzy'), path_num = 1, log = False) for item in result: print(item.score, item.path)
def interactive(client, addr):
    """Serve one client connection: turn received pinyin into hanzi suggestions.

    Loads the word list and vocabulary files, then loops: receive a message,
    take its second tab-separated field, split it into pinyin syllables with
    ``pyt.scan``, decode candidates with ``viterbi`` and send back a numbered
    list prefixed with ``"send\\t"``.

    Args:
        client: connected socket-like object (``recv``, ``send``, ``close``
            are called on it).
        addr: client address; only used in the disconnect message.

    The loop ends, and the client is closed, on any exception raised while
    receiving or parsing a message.
    """
    global re_string, pyt
    # Word list: one entry per line.
    seq = []
    with open("Files/simple_words.txt", "r", encoding='UTF-8') as read:
        for line in read:
            line = line.strip()
            seq.append(line)
    # Vocabulary: "key:value" per line.
    # NOTE(review): a line without ':' raises IndexError here, outside any
    # try block.
    voca = {}
    with open("Files/vocabulary.txt", "r", encoding='UTF-8') as read:
        for line in read:
            line = line.strip().split(":")
            voca[line[0]] = line[1]
    while True:
        try:
            text = client.recv(4096).decode('UTF-8')
            # Payload is the second tab-separated field; a message without a
            # tab raises IndexError and falls through to the outer handler,
            # which closes the connection.
            text = text.split('\t')[1]
            re_string = ""
            if text == "" or text == "send":
                # Nothing to convert: answer with an empty "send" message.
                text = 'send\t' + ""
                client.send(text.encode('UTF-8'))
                print("Text is NULL but send")
            else:
                # NOTE(review): tests for "send" anywhere in the text but
                # always removes the last four characters - confirm that
                # "send" can only appear as a suffix.
                if "send" in text:
                    text = text[0:len(text) - 4]
                words = pyt.scan(str(text))
                # Collect usable syllables; items containing "invalid" are
                # skipped.
                re_tup = tuple()
                for item in words:
                    if "invalid" not in item:
                        # tmp += item + "\'"
                        if not is_pinyin(item):
                            item = simplify_pinyin(item)
                        tmp = (item, )
                        re_tup += tmp
                print("Recv: ", end="")
                print(re_tup)
                re_string = ""
                try:
                    re_string = ""
                    # Entries 1..3: best paths for the whole sequence.
                    result = viterbi(hmm_params=hmm,
                                     observations=re_tup,
                                     path_num=3,
                                     log=True)
                    n = 0
                    for item in result:
                        n += 1
                        string_t = str(n) + "."
                        re_string += string_t
                        for word in item.path:
                            re_string += word
                        re_string += "\n"
                    # tmp becomes the first candidate joined into a string.
                    tmp = ""
                    for item in result:
                        for word in item.path:
                            tmp += word
                        break
                    # Add longer word-list entries that start with tmp,
                    # stopping once the counter reaches 6.
                    for i in range(0, len(seq)):
                        if tmp in seq[i][0:len(tmp)] and len(seq[i]) > len(tmp):
                            n += 1
                            string_t = str(n) + "."
                            re_string += string_t + seq[i] + "\n"
                        if n == 6:
                            break
                    # Add the vocabulary entry for the raw text, if any.
                    if text in voca.keys():
                        n += 1
                        string_t = str(n) + "."
                        re_string += string_t + voca[text] + "\n"
                    # Single-character candidates for the first syllable.
                    # An empty re_tup raises IndexError here and is reported
                    # as "Recv Error" below.
                    dan_tup = (re_tup[0], )
                    result = viterbi(hmm_params=hmm,
                                     observations=dan_tup,
                                     path_num=100,
                                     log=True)
                    for item in result:
                        n += 1
                        # The counter restarts at 1 when it would reach 9.
                        if n == 9:
                            n = 1
                        string_t = str(n) + "."
                        re_string += string_t
                        for word in item.path:
                            re_string += word
                        re_string += "\n"
                    re_string = "send\t" + re_string
                    client.send(re_string.encode('UTF-8'))
                    print(re_string)
                # NOTE(review): bare except also hides programming errors and
                # KeyboardInterrupt; nothing is sent back to the client.
                except:
                    print("Recv Error")
        # Any failure while receiving or parsing is treated as a disconnect.
        except:
            print(str(addr) + 'is out')
            client.close()
            break
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

hmmparams = DefaultHmmParams()

# Two candidates, default scoring.
# Output:
#   1.3155294593897203e-08 ['你', '知', '不', '知', '道']
#   3.6677865125992192e-09 ['你', '只', '不', '知', '道']
result = viterbi(
    hmm_params=hmmparams,
    observations=('ni', 'zhi', 'bu', 'zhi', 'dao'),
    path_num=2,
)
for item in result:
    print(item.score, item.path)

# Two candidates, scored with logarithms.
# Output:
#   -18.14644152864202 ['你', '知', '不', '知', '道']
#   -19.423677486918002 ['你', '只', '不', '知', '道']
result = viterbi(
    hmm_params=hmmparams,
    observations=('ni', 'zhi', 'bu', 'zhi', 'dao'),
    path_num=2,
    log=True,
)
for item in result:
    print(item.score, item.path)

# Two candidates, log scoring, with the syllable 'zhii' (left disabled):
# result = viterbi(hmm_params=hmmparams, observations=('ni', 'zhii', 'bu', 'zhi', 'dao'), path_num = 2, log = True)
# for item in result: