Ejemplo n.º 1
0
def topinyin(s):
    """
    Convert a string into a list of simplified pinyin syllables.

    s is expected to consist entirely of Chinese characters. Every item
    produced by pypinyin.lazy_pinyin is normalised with util.as_text; an item
    equal to '〇' is emitted as 'ling', any other item is passed through
    util.simplify_pinyin.
    """
    text = util.as_text(s)
    # Lazy generator: each item is normalised right before it is mapped.
    normalised = (util.as_text(item) for item in pypinyin.lazy_pinyin(text))
    return [
        'ling' if item == '〇' else util.simplify_pinyin(item)
        for item in normalised
    ]
Ejemplo n.º 2
0
def gen_emission():
    """
    Build the final emission table and write it to FIN_EMISSION_FILE.

    Starts from the table loaded from BASE_EMISSION_FILE, adds 1.0 for every
    (hanzi, pinyin) pair listed in ./hanzipinyin.txt (lines of the form
    ``hanzi=pinyin1,pinyin2,...``), then divides each character's counts by
    their sum, e.g. {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}.

    The written payload is {'default': 1.e-200, 'data': <table>}.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # The context manager closes the file even if a line fails to parse; the
    # explicit encoding keeps the Chinese text independent of the locale.
    with open('./hanzipinyin.txt', encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if not line:
                # A blank line (e.g. a trailing newline) carries no mapping
                # and would otherwise break the two-way unpack below.
                continue
            hanzi, pinyin_list = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for item in pinyin_list.split(','):
                pinyin = util.simplify_pinyin(item.strip())
                counts[pinyin] = counts.get(pinyin, 0.) + 1.

    # Turn the raw counts of every character into relative frequencies.
    for counts in emission.values():
        num_sum = sum(counts.values(), 0.)
        for pinyin in counts:
            counts[pinyin] = counts[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
Ejemplo n.º 3
0
def read_from_sentence_txt(start, emission, transition):
    """
    Accumulate start, emission and transition counts from SENTENCE_FILE.

    Lines shorter than two characters, or rejected by util.is_chinese, are
    skipped. For every remaining line the three dicts are updated in place:

    start:      start[first_char] += 1
    emission:   emission[char][pinyin] += 1 for each (char, pinyin) pair,
                pairing the line with the output of topinyin(line)
    transition: transition[char][next_char] += 1 for each adjacent pair
    """
    ## ./result/sentence.txt
    print('read from sentence.txt')
    # The context manager closes the file even if a line raises.
    with open(SENTENCE_FILE, encoding='utf8') as sentence_file:
        for line in sentence_file:
            line = util.as_text(line.strip())
            if len(line) < 2 or not util.is_chinese(line):
                continue

            ## for start
            start[line[0]] = start.get(line[0], 0) + 1

            ## for emission
            for hanzi, pinyin in zip(line, topinyin(line)):
                counts = emission.setdefault(hanzi, {})
                counts[pinyin] = counts.get(pinyin, 0) + 1

            ## for transition
            for f, t in zip(line[:-1], line[1:]):
                followers = transition.setdefault(f, {})
                followers[t] = followers.get(t, 0) + 1
Ejemplo n.º 4
0
def gen_emission():
    """
    Build the final emission table and write it to FIN_EMISSION_FILE.

    The table loaded from BASE_EMISSION_FILE is extended with 1.0 for every
    (hanzi, pinyin) pair found in ./hanzipinyin.txt, whose lines have the
    form ``hanzi=pinyin1,pinyin2,...``. Each character's counts are then
    divided by their sum, e.g.
    {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}.

    The written payload is {'default': 1.e-200, 'data': <table>}.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Close the file deterministically and read it with an explicit encoding
    # so the Chinese text does not depend on the platform default.
    with open('./hanzipinyin.txt', encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if not line:
                # Skip blank lines; they would break the two-way unpack below.
                continue
            hanzi, pinyin_list = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for item in pinyin_list.split(','):
                pinyin = util.simplify_pinyin(item.strip())
                counts[pinyin] = counts.get(pinyin, 0.) + 1.

    # Normalise the counts of every character so they sum to one.
    for counts in emission.values():
        num_sum = sum(counts.values(), 0.)
        for pinyin in counts:
            counts[pinyin] = counts[pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
Ejemplo n.º 5
0
def read_from_sentence_txt(start, emission, transition):
    """
    Accumulate start, emission and transition counts from SENTENCE_FILE.

    Lines shorter than two characters, or rejected by util.is_chinese, are
    skipped. For every remaining line the three dicts are updated in place:

    start:      start[first_char] += 1
    emission:   emission[char][pinyin] += 1 for each (char, pinyin) pair,
                pairing the line with the output of topinyin(line)
    transition: transition[char][next_char] += 1 for each adjacent pair
    """
    ## ./result/sentence.txt
    print('read from sentence.txt')
    # Explicit encoding: the sentences are Chinese text and must not depend
    # on the platform default. The context manager closes the file on error.
    with open(SENTENCE_FILE, encoding='utf8') as sentence_file:
        for line in sentence_file:
            line = util.as_text(line.strip())
            if len(line) < 2 or not util.is_chinese(line):
                continue

            ## for start
            start[line[0]] = start.get(line[0], 0) + 1

            ## for emission
            for hanzi, pinyin in zip(line, topinyin(line)):
                counts = emission.setdefault(hanzi, {})
                counts[pinyin] = counts.get(pinyin, 0) + 1

            ## for transition
            for f, t in zip(line[:-1], line[1:]):
                followers = transition.setdefault(f, {})
                followers[t] = followers.get(t, 0) + 1
Ejemplo n.º 6
0
def topinyin(s):
    """
    Convert a string into a list of simplified pinyin syllables.

    s is expected to consist entirely of Chinese characters. Every item
    returned by PinyinHelper.convertToPinyinFromSentence is normalised with
    util.as_text; an item equal to '〇' is emitted as 'ling', any other item
    is passed through util.simplify_pinyin.

    If the joined result contains a ',', the input and the joined result are
    printed and the process exits with status 1.
    """
    s = util.as_text(s)
    result = []
    for py in PinyinHelper.convertToPinyinFromSentence(s):
        py = util.as_text(py)
        result.append('ling' if py == '〇' else util.simplify_pinyin(py))

    joined = ''.join(result)
    if ',' in joined:
        # A comma in the output is treated as fatal. Exit with a non-zero
        # status so callers and shell pipelines see the failure.
        print(s)
        print(joined)
        sys.exit(1)
    return result
Ejemplo n.º 7
0
def topinyin(s):
    """
    Convert a string into a list of simplified pinyin syllables.

    s is expected to consist entirely of Chinese characters. Every item
    returned by PinyinHelper.convertToPinyinFromSentence is normalised with
    util.as_text; an item equal to '〇' is emitted as 'ling', any other item
    is passed through util.simplify_pinyin.

    If the joined result contains a ',', the input and the joined result are
    printed and the process exits with status 1.
    """
    s = util.as_text(s)
    result = []
    for py in PinyinHelper.convertToPinyinFromSentence(s):
        py = util.as_text(py)
        result.append('ling' if py == '〇' else util.simplify_pinyin(py))

    joined = ''.join(result)
    if ',' in joined:
        # Abort with a failing exit status: a bare sys.exit() would report
        # success even though the conversion produced unexpected output.
        print(s)
        print(joined)
        sys.exit(1)
    return result
Ejemplo n.º 8
0
def process_hanzipinyin(emission):
    """
    Add one count to ``emission`` for every (hanzi, pinyin) pair listed in
    HANZI2PINYIN_FILE.

    Lines have the form ``hanzi=pinyin1,pinyin2,...``; lines without '=' are
    skipped. ``emission`` is updated in place as
    emission[hanzi][simplified_pinyin] += 1.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # The context manager closes the file even if a line fails to parse.
    with open(HANZI2PINYIN_FILE, 'r', encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for py in pinyins.split(','):
                # Strip each token so whitespace after a comma cannot end up
                # inside the pinyin key.
                pinyin = util.simplify_pinyin(py.strip())
                counts[pinyin] = counts.get(pinyin, 0) + 1
Ejemplo n.º 9
0
def gen_py2hz():
    """
    Build the pinyin -> characters table from PY2HZ_FILE and write it to
    FIN_PY2HZ_FILE.

    Every line must have the form ``pinyin=chars``. Entries where either side
    is empty after stripping are dropped; a later line with the same pinyin
    replaces an earlier one.

    Raises:
        Exception: if a line does not split into exactly two parts on '='.
    """
    data = {}
    # NOTE(review): opened with the platform default encoding, as before;
    # confirm the file's encoding and pass it explicitly if it is fixed.
    with open(PY2HZ_FILE) as source:
        for line in source:
            ls = util.as_text(line.strip()).split('=')
            if len(ls) != 2:
                raise Exception('invalid format')
            py, chars = (part.strip() for part in ls)
            if py and chars:
                data[py] = chars

    writejson2file(data, FIN_PY2HZ_FILE)
Ejemplo n.º 10
0
def process_hanzipinyin(emission):
    """
    Add one count to ``emission`` for every (hanzi, pinyin) pair listed in
    HANZI2PINYIN_FILE.

    Lines have the form ``hanzi=pinyin1,pinyin2,...``; lines without '=' are
    skipped. ``emission`` is updated in place as
    emission[hanzi][simplified_pinyin] += 1.
    """
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    # Explicit encoding keeps the Chinese text independent of the platform
    # default; the context manager closes the file even on a parse error.
    with open(HANZI2PINYIN_FILE, encoding='utf8') as source:
        for line in source:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            counts = emission.setdefault(hanzi, {})
            for py in pinyins.split(','):
                # Strip each token so whitespace after a comma cannot end up
                # inside the pinyin key.
                pinyin = util.simplify_pinyin(py.strip())
                counts[pinyin] = counts.get(pinyin, 0) + 1
Ejemplo n.º 11
0
def gen_py2hz():
    """
    Build the pinyin -> characters table from PY2HZ_FILE and write it to
    FIN_PY2HZ_FILE.

    Every line must have the form ``pinyin=chars``. Entries where either side
    is empty after stripping are dropped; a later line with the same pinyin
    replaces an earlier one.

    Raises:
        Exception: if a line does not split into exactly two parts on '='.
    """
    data = {}
    # NOTE(review): opened with the platform default encoding, as before;
    # confirm the file's encoding and pass it explicitly if it is fixed.
    with open(PY2HZ_FILE) as source:
        for line in source:
            ls = util.as_text(line.strip()).split('=')
            if len(ls) != 2:
                raise Exception('invalid format')
            py, chars = (part.strip() for part in ls)
            if py and chars:
                data[py] = chars

    writejson2file(data, FIN_PY2HZ_FILE)
Ejemplo n.º 12
0
def extract_chinese_sentences(content):
    """
    Split ``content`` into runs of consecutive Chinese characters.

    Spaces and tabs are removed first. Any character rejected by
    util.is_chinese ends the current run. Only runs longer than one character
    (after stripping) are returned, in order of appearance.
    """
    text = util.as_text(content).replace(' ', '').replace('\t', '')

    runs = []
    current = []
    for ch in text:
        if util.is_chinese(ch):
            current.append(ch)
            continue
        # A non-Chinese character closes the run collected so far.
        runs.append(''.join(current))
        current = []
    runs.append(''.join(current))

    stripped = (run.strip() for run in runs)
    return [run for run in stripped if len(run) > 1]
Ejemplo n.º 13
0
def extract_chinese_sentences(content):
    """
    Return the runs of consecutive Chinese characters found in ``content``.

    Spaces and tabs are dropped before scanning. A character for which
    util.is_chinese is false terminates the current run. Runs of length one
    or zero (after stripping) are discarded.
    """
    cleaned = util.as_text(content)
    for blank in (' ', '\t'):
        cleaned = cleaned.replace(blank, '')

    pieces = []
    buffer = []
    for char in cleaned:
        if not util.is_chinese(char):
            # Flush the run collected so far and start a new one.
            pieces.append(''.join(buffer))
            buffer = []
        else:
            buffer.append(char)
    pieces.append(''.join(buffer))

    result = []
    for piece in pieces:
        trimmed = piece.strip()
        if len(trimmed) > 1:
            result.append(trimmed)
    return result
Ejemplo n.º 14
0
def read_from_word_txt(start, emission, transition):
    """
    Accumulate weighted start, emission and transition counts from WORD_FILE.

    Usable lines have the form ``word=num``. The weight of a word is
    max(2.0, num / 1000.0). Lines that are too short, do not split into
    exactly two parts on '=', have an empty number or word, or whose word is
    rejected by util.is_chinese are skipped.

    The three dicts are updated in place with that weight:

    start:      start[first_char] += weight
    emission:   emission[char][pinyin] += weight, pairing the word with the
                output of topinyin(word)
    transition: transition[char][next_char] += weight for adjacent pairs

    Raises:
        ValueError: if the number part cannot be parsed by float().
    """
    ## ! optimisation based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    # Binary mode: each line is turned into text by util.as_text. The context
    # manager closes the file even if a line raises.
    with open(WORD_FILE, 'rb') as word_file:
        for line in word_file:
            line = util.as_text(line.strip())
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                # Also covers lines without any '='.
                continue
            word, num = (part.strip() for part in ls)
            if len(num) == 0:
                continue
            num = max(_min_value, float(num) / _base)

            # A line such as "=12" yields an empty word; skip it explicitly
            # so word[0] below can never raise IndexError.
            if not word or not util.is_chinese(word):
                continue

            ## for start
            start[word[0]] = start.get(word[0], 0) + num

            ## for emission
            for hanzi, pinyin in zip(word, topinyin(word)):
                counts = emission.setdefault(hanzi, {})
                counts[pinyin] = counts.get(pinyin, 0) + num

            ## for transition
            for f, t in zip(word[:-1], word[1:]):
                followers = transition.setdefault(f, {})
                followers[t] = followers.get(t, 0) + num
Ejemplo n.º 15
0
def read_from_word_txt(start, emission, transition):
    """
    Accumulate weighted start, emission and transition counts from WORD_FILE.

    Usable lines have the form ``word=num``. The weight of a word is
    max(2.0, num / 1000.0). Lines that are too short, do not split into
    exactly two parts on '=', have an empty number or word, or whose word is
    rejected by util.is_chinese are skipped.

    The three dicts are updated in place with that weight:

    start:      start[first_char] += weight
    emission:   emission[char][pinyin] += weight, pairing the word with the
                output of topinyin(word)
    transition: transition[char][next_char] += weight for adjacent pairs

    Raises:
        ValueError: if the number part cannot be parsed by float().
    """
    ## ! optimisation based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    # NOTE(review): opened with the platform default encoding, as before;
    # confirm the file's encoding and pass it explicitly if it is fixed.
    with open(WORD_FILE) as word_file:
        for line in word_file:
            line = util.as_text(line.strip())
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                # Also covers lines without any '='.
                continue
            word, num = (part.strip() for part in ls)
            if len(num) == 0:
                continue
            num = max(_min_value, float(num) / _base)

            # A line such as "=12" yields an empty word; skip it explicitly
            # so word[0] below can never raise IndexError.
            if not word or not util.is_chinese(word):
                continue

            ## for start
            start[word[0]] = start.get(word[0], 0) + num

            ## for emission
            for hanzi, pinyin in zip(word, topinyin(word)):
                counts = emission.setdefault(hanzi, {})
                counts[pinyin] = counts.get(pinyin, 0) + num

            ## for transition
            for f, t in zip(word[:-1], word[1:]):
                followers = transition.setdefault(f, {})
                followers[t] = followers.get(t, 0) + num
Ejemplo n.º 16
0
# Running extremes over every count written into `result` below.
max_num = 0.
min_num = 100000000000000.

# Invert the hanzi -> {pinyin: count} mapping in `data` into
# result[simplified_pinyin][hanzi], summing counts that share a simplified key.
for hanzi in data:
    # Take the count together with its ORIGINAL key. Looking it up again after
    # simplification would raise KeyError (or read another entry's count)
    # whenever simplify_pinyin changes the key.
    for pinyin, num in data[hanzi].items():
        pinyin = util.simplify_pinyin(pinyin)
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += num
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])

# Add one count for every character listed in the ``pinyin=chars`` file.
# NOTE(review): opened with the platform default encoding, as before; confirm
# the file's encoding and pass it explicitly if it is fixed.
with open(pinyin2hanzi_file) as pinyin2hanzi_source:
    for line in pinyin2hanzi_source:
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        pinyin, chars = line.split('=')
        if len(pinyin) == 0 or len(chars) == 0:
            continue

        pinyin = util.simplify_pinyin(pinyin)

        for hanzi in chars:
            key = pinyin
            result.setdefault(key, {})
            result[key].setdefault(hanzi, 0)
            result[key][hanzi] += 1.
            max_num = max(max_num, result[key][hanzi])
            min_num = min(min_num, result[key][hanzi])
Ejemplo n.º 17
0
# Running extremes over every count written into `result` below.
max_num = 0.
min_num = 100000000000000.

# Invert the hanzi -> {pinyin: count} mapping in `data` into
# result[simplified_pinyin][hanzi], summing counts that share a simplified key.
for hanzi in data:
    # Read the count under its ORIGINAL key: indexing data[hanzi] with the
    # simplified pinyin fails with KeyError (or picks up a different entry)
    # as soon as simplify_pinyin alters the key.
    for pinyin, num in data[hanzi].items():
        pinyin = util.simplify_pinyin(pinyin)
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += num
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])

# Add one count for every character listed in the ``pinyin=chars`` file.
# NOTE(review): opened with the platform default encoding, as before; confirm
# the file's encoding and pass it explicitly if it is fixed.
with open(pinyin2hanzi_file) as pinyin2hanzi_source:
    for line in pinyin2hanzi_source:
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        pinyin, chars = line.split('=')
        if len(pinyin) == 0 or len(chars) == 0:
            continue

        pinyin = util.simplify_pinyin(pinyin)

        for hanzi in chars:
            key = pinyin
            result.setdefault(key, {})
            result[key].setdefault(hanzi, 0)
            result[key][hanzi] += 1.
            max_num = max(max_num, result[key][hanzi])
            min_num = min(min_num, result[key][hanzi])
Ejemplo n.º 18
0
def read_from_all_pinyin(all_pinyin):
    """
    Append every line of PINYIN_FILE to ``all_pinyin``.

    Each line is stripped and normalised with util.as_text before being
    appended; blank lines are appended as empty strings. The list is modified
    in place and nothing is returned.
    """
    print("begin read from all_pinyin.txt")
    # The context manager closes the file even if a line raises.
    with open(PINYIN_FILE, encoding='utf8') as pinyin_file:
        for line in pinyin_file:
            all_pinyin.append(util.as_text(line.strip()))