def gen_emission():
    """Build the final emission table and write it to FIN_EMISSION_FILE.

    The counts loaded from BASE_EMISSION_FILE are extended with one extra
    count for every (hanzi, pinyin) pair listed in ./hanzipinyin.txt, then
    each hanzi's counts are normalised so that they sum to 1.

    Shape of the emission mapping:
        {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}

    The written payload is {'default': 1.e-200, 'data': <emission mapping>}.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open('./hanzipinyin.txt') as source:
        for line in source:
            line = util.as_text(line.strip())
            # Skip blank / malformed lines rather than crashing on unpacking,
            # matching what process_hanzipinyin does for the same file format.
            if '=' not in line:
                continue
            hanzi, pinyin_list = line.split('=')
            pinyin_list = [
                util.simplify_pinyin(item.strip())
                for item in pinyin_list.split(',')
            ]
            counts = emission.setdefault(hanzi, {})
            for pinyin in pinyin_list:
                counts[pinyin] = counts.get(pinyin, 0.) + 1.

    # Turn the raw counts of every hanzi into relative frequencies.
    for hanzi in emission:
        counts = emission[hanzi]
        # Float start value keeps the division a true division even when the
        # loaded counts are integers.
        num_sum = sum(counts.values(), 0.)
        for pinyin in counts:
            counts[pinyin] = counts[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
def gen_emission():
    """Generate the normalised emission data and dump it to FIN_EMISSION_FILE.

    Steps:
      1. Load the base counts from BASE_EMISSION_FILE.
      2. Add 1 to the count of every (hanzi, pinyin) pair found in
         ./hanzipinyin.txt (lines of the form ``hanzi=py1,py2,...``).
      3. Divide each count by the total for its hanzi.

    Example of the resulting mapping:
        {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}

    The mapping is stored under the 'data' key next to a 'default' value of
    1.e-200.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # The file is opened in a with-block so the handle is always released.
    with open('./hanzipinyin.txt') as handle:
        for raw in handle:
            line = util.as_text(raw.strip())
            # Lines without '=' (e.g. a trailing blank line) cannot be
            # unpacked below; skip them like process_hanzipinyin does.
            if '=' not in line:
                continue
            hanzi, pinyin_list = line.split('=')
            pinyin_list = [
                util.simplify_pinyin(item.strip())
                for item in pinyin_list.split(',')
            ]
            per_hanzi = emission.setdefault(hanzi, {})
            for pinyin in pinyin_list:
                per_hanzi[pinyin] = per_hanzi.get(pinyin, 0.) + 1.

    # Normalise: every hanzi's pinyin counts become fractions of their total.
    for hanzi in emission:
        per_hanzi = emission[hanzi]
        # Summing from 0. guarantees a float denominator.
        num_sum = sum(per_hanzi.values(), 0.)
        for pinyin in per_hanzi:
            per_hanzi[pinyin] = per_hanzi[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
def process_hanzipinyin(emission):
    """Add the pairs listed in HANZI2PINYIN_FILE to the emission counts.

    Each usable line has the form ``hanzi=py1,py2,...``; lines without '='
    are ignored. Every pinyin is passed through util.simplify_pinyin and the
    count emission[hanzi][pinyin] is incremented by 1, creating missing
    entries on the way.

    Args:
        emission: nested mapping hanzi -> pinyin -> count, updated in place.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # Context manager so the file handle is closed once reading is done.
    with open(HANZI2PINYIN_FILE) as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            pinyins = [util.simplify_pinyin(py) for py in pinyins.split(',')]
            counts = emission.setdefault(hanzi, {})
            for pinyin in pinyins:
                counts[pinyin] = counts.get(pinyin, 0) + 1
def process_hanzipinyin(emission):
    """Add the pairs listed in HANZI2PINYIN_FILE to the emission counts.

    The file is read as UTF-8. Each usable line has the form
    ``hanzi=py1,py2,...``; lines without '=' are ignored. Every pinyin is
    passed through util.simplify_pinyin and emission[hanzi][pinyin] is
    incremented by 1, creating missing entries on the way.

    Args:
        emission: nested mapping hanzi -> pinyin -> count, updated in place.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # Context manager so the file handle is closed once reading is done.
    with open(HANZI2PINYIN_FILE, 'r', encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            pinyins = [util.simplify_pinyin(py) for py in pinyins.split(',')]
            counts = emission.setdefault(hanzi, {})
            for pinyin in pinyins:
                counts[pinyin] = counts.get(pinyin, 0) + 1
def topinyin(s):
    """Return the list of simplified pinyin syllables for ``s``.

    ``s`` is expected to consist entirely of Chinese characters. It is
    converted with pypinyin.lazy_pinyin; an item that comes back as the
    character '〇' is mapped to 'ling', every other item goes through
    util.simplify_pinyin.
    """
    text = util.as_text(s)
    return [
        'ling' if syllable == '〇' else util.simplify_pinyin(syllable)
        for syllable in map(util.as_text, pypinyin.lazy_pinyin(text))
    ]
def topinyin(s):
    """Return the list of simplified pinyin syllables for ``s``.

    ``s`` is expected to consist entirely of Chinese characters. It is
    converted with PinyinHelper.convertToPinyinFromSentence; an item that
    comes back as the character '〇' is mapped to 'ling', every other item
    goes through util.simplify_pinyin.

    If the joined result contains a comma, the input and the joined result
    are printed and the process exits via sys.exit().
    """
    s = util.as_text(s)
    converted = PinyinHelper.convertToPinyinFromSentence(s)
    result = [
        'ling' if syllable == '〇' else util.simplify_pinyin(syllable)
        for syllable in map(util.as_text, converted)
    ]

    joined = ''.join(result)
    if ',' not in joined:
        return result

    # A comma in the output is treated as fatal: dump the offending input
    # and the joined output, then terminate.
    print(s)
    print(joined)
    sys.exit()
    return result
def readdatafromfile(filename): with open(filename) as outfile: return json.load(outfile) result = {} data = readdatafromfile(base_emission_file) max_num = 0. min_num = 100000000000000. for hanzi in data: for pinyin in data[hanzi]: pinyin = util.simplify_pinyin(pinyin) num = data[hanzi][pinyin] key = pinyin result.setdefault(key, {}) result[key].setdefault(hanzi, 0) result[key][hanzi] += num max_num = max(max_num, result[key][hanzi]) min_num = min(min_num, result[key][hanzi]) for line in open(pinyin2hanzi_file): line = util.as_text(line.strip()) if '=' not in line: continue pinyin, chars = line.split('=') if len(pinyin) == 0 or len(chars) == 0: continue
return json.load(outfile) result = {} max_num = 0. min_num = 100000000000000. for line in open('./word.txt'): line = util.as_text(line.strip()) if '=' not in line: continue word, num = line.split('=') num = float(num) pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut) pinyins = ','.join(pinyin_list) pinyins = util.simplify_pinyin(pinyins) result.setdefault(pinyins, {}) result[pinyins].setdefault(word, 0) result[pinyins][word] += num max_num = max(max_num, result[pinyins][word]) min_num = min(min_num, result[pinyins][word]) for line in open('./phrase.txt'): line = util.as_text(line.strip()) if '=' not in line: continue word, _ = line.split('=') num = 1. pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut) pinyins = ','.join(pinyin_list) pinyins = util.simplify_pinyin(pinyins)
SOURCE_FILE = './hanzipinyin.txt' ALL_STATES_FILE = './result/all_states.txt' # 汉字(隐藏状态) ALL_OBSERVATIONS_FILE = './result/all_observations.txt' # 拼音(观测值) PINYIN2HANZI_FILE = './result/pinyin2hanzi.txt' states = set() observations = set() py2hz = {} for line in open(SOURCE_FILE): line = util.as_text(line.strip()) hanzi, pinyin_list = line.split('=') pinyin_list = [ util.simplify_pinyin(item.strip()) for item in pinyin_list.split(',') ] states.add(hanzi) for pinyin in pinyin_list: observations.add(pinyin) py2hz.setdefault(pinyin, set()) py2hz[pinyin].add(hanzi) # 声母 shengmu = util.get_shengmu(pinyin) if shengmu is not None: py2hz.setdefault(shengmu, set()) py2hz[shengmu].add(hanzi) with open(ALL_STATES_FILE, 'w') as out:
SOURCE_FILE = './hanzipinyin.txt' ALL_STATES_FILE = './result/all_states.txt' # 汉字(隐藏状态) ALL_OBSERVATIONS_FILE = './result/all_observations.txt' # 拼音(观测值) PINYIN2HANZI_FILE = './result/pinyin2hanzi.txt' states = set() observations = set() py2hz = {} for line in open(SOURCE_FILE): line = util.as_text(line.strip()) hanzi, pinyin_list = line.split('=') pinyin_list = [util.simplify_pinyin(item.strip()) for item in pinyin_list.split(',')] states.add(hanzi) for pinyin in pinyin_list: observations.add(pinyin) py2hz.setdefault(pinyin, set()) py2hz[pinyin].add(hanzi) # 声母 shengmu = util.get_shengmu(pinyin) if shengmu is not None: py2hz.setdefault(shengmu, set()) py2hz[shengmu].add(hanzi) with open(ALL_STATES_FILE, 'w') as out: s = '\n'.join(states)