def read_from_sentence_txt(start, emission, transition):  ## ./result/sentence.txt
    """Accumulate HMM counts from SENTENCE_FILE into the three dicts.

    All three arguments are mutated in place:
      start      -- {first_hanzi: count} sentence-initial character counts
      emission   -- {hanzi: {pinyin: count}} character→pinyin counts
      transition -- {hanzi: {next_hanzi: count}} bigram counts
    """
    print('read from sentence.txt')
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(SENTENCE_FILE) as fin:
        for line in fin:
            line = util.as_text(line.strip())
            # Need at least 2 chars to yield a transition; skip non-Chinese lines.
            if len(line) < 2:
                continue
            if not util.is_chinese(line):
                continue

            ## for start
            start.setdefault(line[0], 0)
            start[line[0]] += 1

            ## for emission
            pinyin_list = topinyin(line)
            char_list = [c for c in line]
            for hanzi, pinyin in zip(char_list, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1

            ## for transition
            for prev_char, next_char in zip(line[:-1], line[1:]):
                transition.setdefault(prev_char, {})
                transition[prev_char].setdefault(next_char, 0)
                transition[prev_char][next_char] += 1
def read_from_sentence_txt(start, emission, transition):  ## ./result/sentence.txt
    """Accumulate HMM counts from SENTENCE_FILE (read as UTF-8).

    All three arguments are mutated in place:
      start      -- {first_hanzi: count} sentence-initial character counts
      emission   -- {hanzi: {pinyin: count}} character→pinyin counts
      transition -- {hanzi: {next_hanzi: count}} bigram counts
    """
    print('read from sentence.txt')
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(SENTENCE_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            # Need at least 2 chars to yield a transition; skip non-Chinese lines.
            if len(line) < 2:
                continue
            if not util.is_chinese(line):
                continue

            ## for start
            start.setdefault(line[0], 0)
            start[line[0]] += 1

            ## for emission
            pinyin_list = topinyin(line)
            char_list = [c for c in line]
            for hanzi, pinyin in zip(char_list, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1

            ## for transition
            for prev_char, next_char in zip(line[:-1], line[1:]):
                transition.setdefault(prev_char, {})
                transition[prev_char].setdefault(next_char, 0)
                transition[prev_char][next_char] += 1
def extract_chinese_sentences(content):
    """Split *content* into maximal runs of consecutive Chinese characters.

    Spaces and tabs are removed first so they do not break an otherwise
    contiguous sentence. Only runs longer than one character are returned.
    """
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    # List buffer + join instead of repeated `s += c` (which is quadratic).
    buf = []
    for ch in content:
        if util.is_chinese(ch):
            buf.append(ch)
        else:
            # Any non-Chinese character terminates the current run.
            sentences.append(''.join(buf))
            buf = []
    sentences.append(''.join(buf))
    return [s.strip() for s in sentences if len(s.strip()) > 1]
def read_from_word_txt(start, emission, transition):  ## ! optimization based on word.txt
    """Accumulate frequency-weighted HMM counts from WORD_FILE.

    Each useful line has the form ``word=frequency``. The frequency is scaled
    by 1/_base and floored at _min_value so every word still contributes.
    All three dict arguments are mutated in place (same shapes as in
    read_from_sentence_txt).
    """
    print('read from word.txt')
    _base = 1000.     # scale raw corpus frequencies down
    _min_value = 2.   # floor so rare words are not drowned out entirely
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(WORD_FILE) as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                continue
            word, num = ls
            word = word.strip()
            num = num.strip()
            if len(num) == 0:
                continue
            try:
                num = float(num)
            except ValueError:
                # Malformed frequency field — skip, consistent with the
                # other guard clauses above.
                continue
            num = max(_min_value, num / _base)
            if not util.is_chinese(word):
                continue

            ## for start
            start.setdefault(word[0], 0)
            start[word[0]] += num

            ## for emission
            pinyin_list = topinyin(word)
            char_list = [c for c in word]
            for hanzi, pinyin in zip(char_list, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += num

            ## for transition
            for prev_char, next_char in zip(word[:-1], word[1:]):
                transition.setdefault(prev_char, {})
                transition[prev_char].setdefault(next_char, 0)
                transition[prev_char][next_char] += num
def read_from_word_txt(start, emission, transition):  ## ! optimization based on word.txt
    """Accumulate frequency-weighted HMM counts from WORD_FILE (binary read).

    Each useful line has the form ``word=frequency``; lines are read as bytes
    and decoded by util.as_text. The frequency is scaled by 1/_base and
    floored at _min_value. All three dict arguments are mutated in place
    (same shapes as in read_from_sentence_txt).
    """
    print('read from word.txt')
    _base = 1000.     # scale raw corpus frequencies down
    _min_value = 2.   # floor so rare words are not drowned out entirely
    # Binary mode kept deliberately: util.as_text handles the decoding.
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(WORD_FILE, 'rb') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                continue
            word, num = ls
            word = word.strip()
            num = num.strip()
            if len(num) == 0:
                continue
            try:
                num = float(num)
            except ValueError:
                # Malformed frequency field — skip, consistent with the
                # other guard clauses above.
                continue
            num = max(_min_value, num / _base)
            if not util.is_chinese(word):
                continue

            ## for start
            start.setdefault(word[0], 0)
            start[word[0]] += num

            ## for emission
            pinyin_list = topinyin(word)
            char_list = [c for c in word]
            for hanzi, pinyin in zip(char_list, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += num

            ## for transition
            for prev_char, next_char in zip(word[:-1], word[1:]):
                transition.setdefault(prev_char, {})
                transition[prev_char].setdefault(next_char, 0)
                transition[prev_char][next_char] += num