Exemple #1
0
def read_from_sentence_txt(start, emission, transition):
    ## ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1
        
        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]

        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1


        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
def read_from_sentence_txt(start, emission, transition):
    ## ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE, encoding='utf8'):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1

        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]

        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1

        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
Exemple #3
0
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)

    return [s.strip() for s in sentences if len(s.strip()) > 1]
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)

    return [s.strip() for s in sentences if len(s.strip()) > 1]
Exemple #5
0
def read_from_word_txt(start, emission, transition):
    ## ! 基于word.txt的优化
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num  = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num/_base)
        
        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num
def read_from_word_txt(start, emission, transition):
    ## ! 基于word.txt的优化
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE, 'rb'):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num / _base)

        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num