Ejemplo n.º 1
0
def process_hanzipinyin(emission):
    """Count hanzi -> pinyin pairs and dump them to ``data/dictionary.txt``.

    Reads HANZI2PINYIN_FILE (UTF-8) line by line. Lines that do not contain
    ``=`` are skipped; the others are split as ``hanzi=py1,py2,...`` and every
    pinyin is passed through ``util.simplify_pinyin``. For each such line a
    ``hanzi<TAB>py1,py2,...`` record is written to ``data/dictionary.txt`` and
    ``emission[hanzi][pinyin]`` is incremented once per listed pinyin.

    Args:
        emission: dict of dicts (hanzi -> pinyin -> count), updated in place.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    with codecs.open(HANZI2PINYIN_FILE, 'r', 'utf-8') as fin:
        with codecs.open("data/dictionary.txt", 'w', 'utf-8') as fout:
            # Iterating the reader yields the same lines as repeated
            # readline() calls and stops at end of file.
            for line in fin:
                line = line.strip()
                if '=' not in line:
                    continue
                hanzi, pinyins = line.split('=')
                pinyins = [
                    util.simplify_pinyin(py) for py in pinyins.split(',')
                ]
                # str.join replaces the hand-rolled "append comma unless
                # last" loop.
                fout.write(u"{}\t{}\n".format(hanzi, ",".join(pinyins)))
                # split(',') always yields at least one item, so the hanzi
                # entry is created exactly as before.
                counts = emission.setdefault(hanzi, {})
                for pinyin in pinyins:
                    counts.setdefault(pinyin, 0)
                    counts[pinyin] += 1
Ejemplo n.º 2
0
def gen_emission():
    """Build the final emission table and write it out as JSON.

    Starts from the table loaded from BASE_EMISSION_FILE, adds 1.0 for every
    (hanzi, pinyin) pair listed in HZ2PY_FILE (lines of the form
    ``hanzi=py1,py2,...``), then replaces each count with the natural log of
    its share of that hanzi's total, rounded to 6 decimals. The result is
    written to FIN_EMISSION_FILE as
    ``{'default': round(log(1e-200), 6), 'data': emission}``.

    Example shape of the base table:
        {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}
    """
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Context manager so the handle is closed even if a line fails to parse.
    with open(HZ2PY_FILE) as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                # Blank or malformed line: nothing to count. Mirrors the
                # guard used by process_hanzipinyin.
                continue
            hanzi, pinyin_list = line.split('=')
            pinyin_list = [
                util.simplify_pinyin(item.strip())
                for item in pinyin_list.split(',')
            ]

            counts = emission.setdefault(hanzi, {})
            for pinyin in pinyin_list:
                counts.setdefault(pinyin, 0.)
                counts[pinyin] += 1.

    # Turn raw counts into log relative frequencies per hanzi.
    for hanzi in emission:
        counts = emission[hanzi]
        # Float accumulator keeps the division below a true division even
        # when the stored counts are integers.
        num_sum = 0.
        for pinyin in counts:
            num_sum += counts[pinyin]
        for pinyin in counts:
            counts[pinyin] = round(math.log(counts[pinyin] / num_sum), 6)

    data = {
        'default': round(math.log(1.e-200), 6),
        'data': emission,
    }
    writejson2file(data, FIN_EMISSION_FILE)
Ejemplo n.º 3
0
def process_hanzipinyin(emission):
    """Add hanzi -> pinyin counts from HANZI2PINYIN_FILE to ``emission``.

    Each line is stripped and converted with ``util.as_text``. Lines without
    ``=`` are skipped; the others are split as ``hanzi=py1,py2,...``, every
    pinyin is passed through ``util.simplify_pinyin``, and
    ``emission[hanzi][pinyin]`` is incremented once per listed pinyin.

    Args:
        emission: dict of dicts (hanzi -> pinyin -> count), updated in place.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # Context manager so the file handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(HANZI2PINYIN_FILE) as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            pinyins = [util.simplify_pinyin(py) for py in pinyins.split(',')]
            for pinyin in pinyins:
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1
Ejemplo n.º 4
0
def topinyin(s):
    """Convert a string of Chinese characters into a list of pinyin.

    ``s`` is expected to consist entirely of Chinese characters. Each item
    returned by ``pypinyin.lazy_pinyin`` is simplified with
    ``util.simplify_pinyin``, except the character '〇', which is mapped to
    'ling'.
    """
    text = util.as_text(s)
    syllables = [util.as_text(raw) for raw in pypinyin.lazy_pinyin(text)]
    return [
        'ling' if syllable == '〇' else util.simplify_pinyin(syllable)
        for syllable in syllables
    ]
Ejemplo n.º 5
0
def topinyin(s):
    """Convert a string of Chinese characters into a list of pinyin.

    ``s`` is expected to consist entirely of Chinese characters. Each item
    returned by ``PinyinHelper.convertToPinyinFromSentence`` is simplified
    with ``util.simplify_pinyin``, except the character '〇', which is mapped
    to 'ling'. If the concatenated result contains a comma, the input and
    the concatenated result are printed and the process exits.
    """
    text = util.as_text(s)
    converted = []
    for raw in PinyinHelper.convertToPinyinFromSentence(text):
        syllable = util.as_text(raw)
        if syllable != '〇':
            converted.append(util.simplify_pinyin(syllable))
        else:
            converted.append('ling')

    joined = ''.join(converted)
    if ',' not in joined:
        return converted

    # A comma in the output is treated as fatal: dump the offending input
    # and output, then stop the program.
    print(text)
    print(joined)
    sys.exit()
    return converted
Ejemplo n.º 6
0
ALL_STATES_FILE = 'data/all_states.txt'  # Chinese characters (hidden states)
ALL_OBSERVATIONS_FILE = 'data/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = 'data/pinyin2hanzi.txt'

states = set()        # every hanzi seen in SOURCE_FILE
observations = set()  # every simplified pinyin seen in SOURCE_FILE
py2hz = {}            # pinyin (or its initial) -> set of hanzi

with codecs.open(SOURCE_FILE, 'r', 'utf-8') as fin:
    # Iterate the file instead of "readline().strip(); break if empty":
    # that form stopped at the first blank line and silently dropped
    # everything after it.
    for line in fin:
        line = line.strip()
        if '=' not in line:
            # Blank or malformed line: skip it and keep reading.
            continue
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]

        states.add(hanzi)

        for pinyin in pinyin_list:
            observations.add(pinyin)
            py2hz.setdefault(pinyin, set())
            py2hz[pinyin].add(hanzi)
            # Also index the hanzi under the syllable's initial (shengmu),
            # when util.get_shengmu returns one.
            shengmu = util.get_shengmu(pinyin)
            if shengmu is not None:
                py2hz.setdefault(shengmu, set())
                py2hz[shengmu].add(hanzi)
Ejemplo n.º 7
0
SOURCE_FILE = '../data/train/original/hanzipinyin.txt'

ALL_STATES_FILE = '../data/train/result/all_states.txt'  # Chinese characters (hidden states)
ALL_OBSERVATIONS_FILE = '../data/train/result/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = '../data/train/result/pinyin2hanzi.txt'

states = set()        # every hanzi seen in SOURCE_FILE
observations = set()  # every simplified pinyin seen in SOURCE_FILE
py2hz = {}            # pinyin (or its initial) -> set of hanzi

# Context manager so the input handle is closed once parsing finishes
# (or fails) instead of being left to the garbage collector.
with open(SOURCE_FILE) as fin:
    for line in fin:
        line = util.as_text(line.strip())
        if '=' not in line:
            # Blank or malformed line: skip it rather than crash on unpack.
            continue
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]

        states.add(hanzi)

        for pinyin in pinyin_list:
            observations.add(pinyin)
            py2hz.setdefault(pinyin, set())
            py2hz[pinyin].add(hanzi)
            # Also index the hanzi under the syllable's initial (shengmu),
            # when util.get_shengmu returns one.
            shengmu = util.get_shengmu(pinyin)
            if shengmu is not None:
                py2hz.setdefault(shengmu, set())
                py2hz[shengmu].add(hanzi)

with open(ALL_STATES_FILE, 'w') as out: