Ejemplo n.º 1
0
def gen_emission():
    """Build the final emission table and write it to FIN_EMISSION_FILE.

    Starts from the table loaded from BASE_EMISSION_FILE, adds one count for
    every (hanzi, pinyin) pair listed in ./hanzipinyin.txt, and then rescales
    each hanzi's values so that they sum to 1.

    Example shape of the table (taken from the original note):
        {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # A context manager guarantees the file is closed, even if a line fails
    # to parse half-way through.
    with open('./hanzipinyin.txt') as source:
        for line in source:
            line = util.as_text(line.strip())
            # Blank or malformed lines have no '=' and would break the
            # two-value unpacking below, so skip them.
            if '=' not in line:
                continue
            hanzi, pinyin_list = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for item in pinyin_list.split(','):
                pinyin = util.simplify_pinyin(item.strip())
                counts[pinyin] = counts.get(pinyin, 0.) + 1.

    # Normalize every hanzi's counts into relative frequencies.
    for counts in emission.values():
        # Float start value keeps the division below a true division even
        # when the loaded counts are integers.
        num_sum = sum(counts.values(), 0.)
        for pinyin in counts:
            counts[pinyin] = counts[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
Ejemplo n.º 2
0
def gen_emission():
    """Generate the normalized emission table and save it as JSON.

    The table loaded from BASE_EMISSION_FILE is extended with one count per
    (hanzi, pinyin) pair found in ./hanzipinyin.txt; afterwards the values of
    each hanzi are divided by their sum.  The result is stored under the
    'data' key and written to FIN_EMISSION_FILE.

    Example shape of the table (taken from the original note):
        {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Open through ``with`` so the handle is released deterministically.
    with open('./hanzipinyin.txt') as source:
        for line in source:
            line = util.as_text(line.strip())
            # Lines without '=' (e.g. a trailing blank line) cannot be
            # unpacked into hanzi / pinyin list; ignore them.
            if '=' not in line:
                continue
            hanzi, pinyin_list = line.split('=')
            per_hanzi = emission.setdefault(hanzi, {})
            for item in pinyin_list.split(','):
                pinyin = util.simplify_pinyin(item.strip())
                per_hanzi.setdefault(pinyin, 0.)
                per_hanzi[pinyin] += 1.

    # Convert counts to relative frequencies, one hanzi at a time.
    for hanzi in emission:
        per_hanzi = emission[hanzi]
        # Start from a float so integer counts still divide as floats.
        num_sum = sum(per_hanzi.values(), 0.)
        for pinyin in per_hanzi:
            per_hanzi[pinyin] = per_hanzi[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
Ejemplo n.º 3
0
def process_hanzipinyin(emission):
    """Add the hanzi/pinyin pairs from HANZI2PINYIN_FILE to ``emission``.

    Every usable line has the form ``hanzi=pinyin1,pinyin2,...``; lines
    without '=' are skipped.  ``emission`` is modified in place: for each
    listed pinyin (after ``util.simplify_pinyin``) the counter
    ``emission[hanzi][pinyin]`` is created if needed and incremented by 1.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # ``with`` closes the file as soon as parsing is done (or fails).
    with open(HANZI2PINYIN_FILE) as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for py in pinyins.split(','):
                pinyin = util.simplify_pinyin(py)
                counts[pinyin] = counts.get(pinyin, 0) + 1
Ejemplo n.º 4
0
def process_hanzipinyin(emission):
    """Merge counts from HANZI2PINYIN_FILE (read as UTF-8) into ``emission``.

    Lines are expected to look like ``hanzi=pinyin1,pinyin2,...``; any line
    without '=' is ignored.  For each pinyin of a line (passed through
    ``util.simplify_pinyin``), ``emission[hanzi][pinyin]`` is incremented by
    1, creating missing entries on the way.  Nothing is returned; the
    ``emission`` mapping is updated in place.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # Context manager so the handle does not stay open after the loop.
    with open(HANZI2PINYIN_FILE, 'r', encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            per_hanzi = emission.setdefault(hanzi, {})
            for py in pinyins.split(','):
                pinyin = util.simplify_pinyin(py)
                per_hanzi[pinyin] = per_hanzi.get(pinyin, 0) + 1
Ejemplo n.º 5
0
def topinyin(s):
    """
    Convert ``s`` (expected to consist only of hanzi) into a list of pinyin.

    Each item produced by ``pypinyin.lazy_pinyin`` is passed through
    ``util.simplify_pinyin``, except the character '〇', which is mapped
    to 'ling'.
    """
    text = util.as_text(s)
    converted = []
    for raw in pypinyin.lazy_pinyin(text):
        syllable = util.as_text(raw)
        converted.append(
            'ling' if syllable == '〇' else util.simplify_pinyin(syllable)
        )
    return converted
Ejemplo n.º 6
0
def topinyin(s):
    """
    Convert ``s`` (expected to consist only of hanzi) into a list of pinyin.

    Items returned by ``PinyinHelper.convertToPinyinFromSentence`` are
    simplified with ``util.simplify_pinyin``; the character '〇' becomes
    'ling'.  If the joined result contains a comma, the input and the joined
    result are printed and the process exits via ``sys.exit()``.
    """
    s = util.as_text(s)
    # Generator keeps the per-item call order: as_text, then simplify.
    normalized = (
        util.as_text(item)
        for item in PinyinHelper.convertToPinyinFromSentence(s)
    )
    result = [
        'ling' if py == '〇' else util.simplify_pinyin(py)
        for py in normalized
    ]

    joined = ''.join(result)
    if ',' in joined:
        print(s)
        print(joined)
        sys.exit()
    return result
Ejemplo n.º 7
0
def topinyin(s):
    """
    Return the simplified pinyin of ``s`` as a list; ``s`` is expected to
    contain only hanzi.

    '〇' is translated to 'ling'; every other item from
    ``PinyinHelper.convertToPinyinFromSentence`` goes through
    ``util.simplify_pinyin``.  A comma anywhere in the output triggers a
    printout of the input and the concatenated output, followed by
    ``sys.exit()``.
    """
    s = util.as_text(s)
    result = []
    for item in PinyinHelper.convertToPinyinFromSentence(s):
        py = util.as_text(item)
        mapped = 'ling' if py == '〇' else util.simplify_pinyin(py)
        result.append(mapped)

    flattened = ''.join(result)
    has_comma = ',' in flattened
    if has_comma:
        print(s)
        print(flattened)
        sys.exit()
    return result
Ejemplo n.º 8
0
def readdatafromfile(filename):
    """Load the JSON document stored in ``filename`` and return it."""
    with open(filename) as source:
        payload = json.load(source)
    return payload


# pinyin -> {hanzi: accumulated count}, built by inverting the base emission
# table (hanzi -> {pinyin: count}).
result = {}

data = readdatafromfile(base_emission_file)

# Largest / smallest accumulated count seen so far.
max_num = 0.
min_num = 100000000000000.

for hanzi in data:
    # Read the count under the ORIGINAL key; the simplified pinyin may differ
    # from the key stored in ``data`` and must not be used to index it.
    for raw_pinyin, num in data[hanzi].items():
        pinyin = util.simplify_pinyin(raw_pinyin)
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += num
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])

# Walk the pinyin -> hanzi file; each usable line looks like ``pinyin=chars``.
for line in open(pinyin2hanzi_file):
    line = util.as_text(line.strip())
    # Lines without the '=' separator carry no entry.
    if '=' not in line:
        continue
    pinyin, chars = line.split('=')
    # Ignore entries whose pinyin part or character part is empty.
    if len(pinyin) == 0 or len(chars) == 0:
        continue
Ejemplo n.º 9
0
def readdatafromfile(filename):
    """Read ``filename`` and return its parsed JSON content."""
    with open(filename) as source:
        return json.loads(source.read())


# Inverted emission table: simplified pinyin -> {hanzi: accumulated count}.
result = {}

data = readdatafromfile(base_emission_file)

# Running maximum / minimum of the accumulated counts.
max_num = 0.
min_num = 100000000000000.

for hanzi in data:
    # Take the count together with its original key: indexing ``data[hanzi]``
    # with the simplified pinyin fails (or reads another entry) whenever
    # simplification changes the string.
    for raw_pinyin, num in data[hanzi].items():
        pinyin = util.simplify_pinyin(raw_pinyin)
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += num
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])

# Read the pinyin -> hanzi file line by line (``pinyin=chars`` per line).
for line in open(pinyin2hanzi_file):
    line = util.as_text(line.strip())
    # Skip anything that does not contain the '=' separator.
    if '=' not in line:
        continue
    pinyin, chars = line.split('=')
    # Skip entries with an empty left-hand or right-hand side.
    if len(pinyin) == 0 or len(chars) == 0:
        continue
Ejemplo n.º 10
0
        return json.load(outfile)


# Comma-joined, simplified pinyin -> {word: accumulated count}.
result = {}
# Largest / smallest accumulated count seen so far.
max_num = 0.
min_num = 100000000000000.

# word.txt: each usable line is ``word=count``.  The context manager closes
# the file once the loop finishes or raises.
with open('./word.txt') as word_file:
    for line in word_file:
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        word, num = line.split('=')
        num = float(num)
        pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut)
        pinyins = ','.join(pinyin_list)
        pinyins = util.simplify_pinyin(pinyins)
        result.setdefault(pinyins, {})
        result[pinyins].setdefault(word, 0)
        result[pinyins][word] += num
        max_num = max(max_num, result[pinyins][word])
        min_num = min(min_num, result[pinyins][word])

# phrase.txt: only the part before '=' is used; every phrase gets weight 1.
for line in open('./phrase.txt'):
    line = util.as_text(line.strip())
    # Lines without '=' are ignored.
    if '=' not in line:
        continue
    word, _ = line.split('=')
    num = 1.
    # Convert the phrase to pinyin, join the items with ',' and simplify.
    pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut)
    pinyins = ','.join(pinyin_list)
    pinyins = util.simplify_pinyin(pinyins)
Ejemplo n.º 11
0
SOURCE_FILE = './hanzipinyin.txt'

ALL_STATES_FILE = './result/all_states.txt'  # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = './result/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = './result/pinyin2hanzi.txt'

states = set()        # every hanzi seen in SOURCE_FILE
observations = set()  # every simplified pinyin seen in SOURCE_FILE
py2hz = {}            # pinyin (or shengmu) -> set of hanzi

# Each usable line is ``hanzi=pinyin1,pinyin2,...``.  The context manager
# closes the file once parsing ends or fails.
with open(SOURCE_FILE) as source:
    for line in source:
        line = util.as_text(line.strip())
        # Blank or malformed lines cannot be unpacked below; skip them.
        if '=' not in line:
            continue
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip()) for item in pinyin_list.split(',')
        ]

        states.add(hanzi)

        for pinyin in pinyin_list:
            observations.add(pinyin)
            py2hz.setdefault(pinyin, set()).add(hanzi)
            # shengmu: also index the hanzi under what util.get_shengmu
            # returns, when that is not None.
            shengmu = util.get_shengmu(pinyin)
            if shengmu is not None:
                py2hz.setdefault(shengmu, set()).add(hanzi)

with open(ALL_STATES_FILE, 'w') as out:
Ejemplo n.º 12
0
        return json.load(outfile)

# Simplified, comma-joined pinyin -> {word: accumulated count}.
result = {}
# Running maximum / minimum of the accumulated counts.
max_num = 0.
min_num = 100000000000000.


# word.txt lines look like ``word=count``; open the file through ``with`` so
# the handle is released when the loop ends or raises.
with open('./word.txt') as word_file:
    for line in word_file:
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        word, num = line.split('=')
        num = float(num)
        pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut)
        pinyins = ','.join(pinyin_list)
        pinyins = util.simplify_pinyin(pinyins)
        result.setdefault(pinyins, {})
        result[pinyins].setdefault(word, 0)
        result[pinyins][word] += num
        max_num = max(max_num, result[pinyins][word])
        min_num = min(min_num, result[pinyins][word])

# phrase.txt: the value after '=' is discarded and each phrase counts as 1.
for line in open('./phrase.txt'):
    line = util.as_text(line.strip())
    # Skip lines that have no '=' separator.
    if '=' not in line:
        continue
    word, _ = line.split('=')
    num = 1.
    # Pinyin of the phrase, joined with ',' and then simplified.
    pinyin_list = PinyinHelper.convertToPinyinFromSentence(word, segment=cut)
    pinyins = ','.join(pinyin_list)
    pinyins = util.simplify_pinyin(pinyins)
Ejemplo n.º 13
0
SOURCE_FILE           = './hanzipinyin.txt'

ALL_STATES_FILE       = './result/all_states.txt'         # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = './result/all_observations.txt'   # pinyin (observations)
PINYIN2HANZI_FILE     = './result/pinyin2hanzi.txt'

states = set()        # all hanzi read from SOURCE_FILE
observations = set()  # all simplified pinyin read from SOURCE_FILE
py2hz = {}            # pinyin (or shengmu) -> set of hanzi


# Lines are expected as ``hanzi=pinyin1,pinyin2,...``.  Reading through a
# context manager makes sure the file is closed afterwards.
with open(SOURCE_FILE) as source:
    for line in source:
        line = util.as_text(line.strip())
        # A line without '=' (e.g. a blank one) would break the unpacking
        # below, so it is skipped.
        if '=' not in line:
            continue
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [util.simplify_pinyin(item.strip()) for item in pinyin_list.split(',')]

        states.add(hanzi)

        for pinyin in pinyin_list:
            observations.add(pinyin)
            py2hz.setdefault(pinyin, set()).add(hanzi)
            # shengmu: register the hanzi under the value returned by
            # util.get_shengmu as well, unless it is None.
            shengmu = util.get_shengmu(pinyin)
            if shengmu is not None:
                py2hz.setdefault(shengmu, set()).add(hanzi)

with open(ALL_STATES_FILE, 'w') as out:
    s = '\n'.join(states)