def _segment_full(
        full: str,
        candidates_per_char: List[List[str]]) -> List[Tuple[List[str], str]]:
    """Split ``full`` into consecutive pieces, one candidate per character.

    Args:
        full: The un-separated pinyin string to split.
        candidates_per_char: For each character, the strings that character
            may contribute, tried in the given order.

    Returns:
        Every ``(pieces, rest)`` pair that consumed ``full`` completely
        (``rest == ''``) using at least one piece.
    """
    # Start from a single "nothing consumed yet" state so the first character
    # is handled exactly like the others. A dead end (no candidate matches)
    # therefore stays a dead end instead of restarting from the beginning.
    partials: List[Tuple[List[str], str]] = [([], full)]
    for candidates in candidates_per_char:
        partials = [
            (pieces + [candidate], rest[len(candidate):])
            for pieces, rest in partials
            for candidate in candidates
            if rest.startswith(candidate)
        ]
    return [p for p in partials if len(p[0]) > 0 and p[1] == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Rewrite ``WordPhoneTable.full`` as space-separated per-character pinyin.

    For every row:
      * an empty ``full`` is regenerated with ``get_full(word)``;
      * a ``full`` that already has one space-separated entry per character
        is left alone;
      * anything else is split using the candidates in ``char_phones`` and is
        updated only when exactly one split consumes the whole string.

    Changed rows are written back with one ``bulk_update`` inside a
    transaction. Problems are reported with ``print`` and the row is skipped.

    Args:
        char_phones: Maps a character to the pinyin strings it may have.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full

        # Check for the empty value first: ''.split(' ') is [''], which would
        # otherwise make a one-character word look already complete.
        if full == '':
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        candidates_per_char: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                print(f"{char} not in phone table")
                # FIXME: should this be a hard error instead?
                # raise RuntimeError(f"{char} in phone table")
                continue
            # Longest candidates first.
            candidates_per_char.append(
                sorted(char_phones[char], key=len, reverse=True))

        full_arr = _segment_full(full, candidates_per_char)
        if len(full_arr) != 1:
            # Zero or several possible splits: do not guess.
            print(f"wrong format: {item}, {full_arr}")
            # FIXME: should this be a hard error instead?
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if to_update_items:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
def check_wordphonetable_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column from ``full`` and fix rows that differ.

    Each space-separated entry of ``full`` is split with ``split_sy`` into two
    parts; the row's expected code is the concatenation of
    ``transformer[part]`` for both parts of every entry. Rows whose stored
    code differs are corrected and written back with a single
    ``bulk_update`` inside a transaction.

    Args:
        transformer: Maps each part returned by ``split_sy`` to its code.
        schema: Which shuangpin schema (and therefore which column) to check.

    Raises:
        RuntimeError: If ``schema`` is not one of the known schemas.
        KeyError: If a part is missing from ``transformer``.
    """
    # Resolve the column once so an unknown schema fails before any row is
    # read, and the same mapping is not repeated at every use.
    if schema == XHE_SP_SCHEMA:
        field_name = 'xhe'
    elif schema == LU_SP_SCHEMA:
        field_name = 'lu'
    elif schema == ZRM_SP_SCHEMA:
        field_name = 'zrm'
    elif schema == BINGJI_SP_SCHEMA:
        field_name = 'bingji'
    else:
        raise RuntimeError(f'unknown schema: {schema}')

    to_update_items = []
    for item in WordPhoneTable.select():
        full_shuangpins_arr = []
        for full in item.full.split(' '):
            s, y = split_sy(full)
            full_shuangpins_arr.append(transformer[s] + transformer[y])
        full_shuangpins = ''.join(full_shuangpins_arr)

        if full_shuangpins != getattr(item, field_name):
            setattr(item, field_name, full_shuangpins)
            to_update_items.append(item)

    with db.atomic():
        WordPhoneTable.bulk_update(to_update_items,
                                   fields=[field_name],
                                   batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} wordphonetable items')
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer) -> WordPhoneTable:
    """Build an unsaved ``WordPhoneTable`` row from one split input line.

    Accepted column layouts:
      * ``[word]``: priority is 1, pinyin comes from ``get_full``;
      * ``[word, priority]``: pinyin comes from ``get_full``;
      * ``[word, priority, p1, ..., pN]`` with one entry per character of
        ``word``: blank entries are dropped after stripping.

    Raises:
        RuntimeError: If the number of columns matches none of the layouts.
    """
    n_cols = len(cols)
    if n_cols == 1:
        word, priority = cols[0], 1
        full = get_full(word)
    elif n_cols == 2:
        word, priority = cols[0], cols[1]
        full = get_full(word)
    elif n_cols == 2 + len(cols[0]):
        word, priority = cols[0], cols[1]
        stripped = [raw.strip() for raw in cols[2:]]
        full = [piece for piece in stripped if len(piece) > 0]
    else:
        raise RuntimeError("word item should be: 你好 [priority n i h ao]")

    # Derive the stored strings in the same order as the model's keyword
    # arguments: full first, then each shuangpin column.
    joined_full = ''.join(full)
    xhe_code = ''.join([full_to_two(piece, xhe_transformer) for piece in full])
    zrm_code = ''.join([full_to_two(piece, zrm_transformer) for piece in full])
    return WordPhoneTable(
        word=word,
        full=joined_full,
        xhe=xhe_code,
        zrm=zrm_code,
        lu="",
        priority=priority,
        updatedt=datetime.now(),
    )
def main():
    """Load words from the file named on the command line and insert them.

    Expects exactly one argument, the words file. With any other argument
    count the usage text is printed and the process exits with status 1.
    Otherwise the items returned by ``load_words`` are echoed and inserted
    with ``bulk_create`` inside one transaction.
    """
    if len(sys.argv) != 2:
        usage_lines = (
            f"使用方法: python3 {sys.argv[0]} words.txt",
            "文件行格式:word [w1_yin w2_yin ... prioroty]",
            "举例:你好 [ni hao 100]",
            "中括号内为可选内容",
        )
        # The whole usage text is diagnostic output, so all of it goes to
        # stderr (previously only the first line did).
        for line in usage_lines:
            print(line, file=sys.stderr)
        sys.exit(1)

    words_path = sys.argv[1]
    add_words = load_words(words_path)
    print(add_words)
    with db.atomic():
        WordPhoneTable.bulk_create(add_words, batch_size=100)
    print(f'done, add {len(add_words)} items')
def get_exists_words() -> Set[str]:
    """Return every entry that already exists.

    The result combines the values from ``get_exists_chars()`` with the
    ``word`` column of ``WordPhoneTable`` and of ``TangshiTable``.
    """
    # ``set.union`` returns a new set and leaves the receiver untouched, so
    # the characters were silently dropped before; seed the set with them.
    exist_words: Set[str] = set(get_exists_chars())
    exist_words.update(e.word for e in WordPhoneTable.select())
    exist_words.update(e.word for e in TangshiTable.select())
    return exist_words
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer,
                             bingji_transformer,
                             lu_transformer) -> Union[WordPhoneTable, None]:
    """Build an unsaved ``WordPhoneTable`` row from one split input line.

    Accepted column layouts (``N`` is the number of characters in the word):
      * ``[word]``: priority 100, pinyin from ``get_full``;
      * ``[word, p1, ..., pN]``: priority 100, explicit pinyin;
      * ``[word, p1, ..., pN, priority]``: explicit pinyin and priority.

    The two-column ``word priority`` layout is not handled specially.

    Returns:
        The new row, or ``None`` when ``get_full`` fails for a bare word.

    Raises:
        RuntimeError: If the column count matches none of the layouts.
        ValueError: If the trailing priority column is not an integer.
    """
    if len(cols) == 1:
        word = cols[0]
        priority = 100
        try:
            full = get_full(word)
        except Exception as e:
            # Best effort: report the word that could not be converted and
            # let the caller skip it.
            print(e)
            return None
    elif len(cols) == 1 + len(cols[0]):
        word = cols[0]
        priority = 100
        full = [p for p in (e.strip() for e in cols[1:]) if len(p) > 0]
    elif len(cols) == 2 + len(cols[0]):
        word = cols[0]
        priority = int(cols[-1])
        # The last column is the priority, not a syllable, so stop before it.
        full = [p for p in (e.strip() for e in cols[1:-1]) if len(p) > 0]
    else:
        raise RuntimeError("word item should be: 你好 [ni hao 100]")

    item = WordPhoneTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full))
    print("add ", item)
    return item
def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Store ``lu`` on ``item.lu`` and hand the same object back."""
    setattr(item, "lu", lu)
    return item
fout.write(f"---config@码表别名=系统单字\n") pipe( CharPhoneTable.select().order_by(CharPhoneTable.priority.desc()), filter(lambda e: e.char in char_to_shape), map(lambda e: f"{e.char}\t{e.zrm+char_to_shape[e.char]}#序40000"), for_each(lambda e: fout.write(e + '\n')), ) del_words = pipe(DelWordTable.select(), map(lambda e: e.word), set) sys_word_data = f"{output_dir}/sys_word_data.txt" with open(sys_word_data, 'w', encoding='utf8') as fout: fout.write("---config@码表分类=主码-2\n") fout.write("---config@允许编辑=否\n") fout.write(f"---config@码表别名=系统词组\n") pipe( WordPhoneTable.select().order_by(fn.LENGTH(WordPhoneTable.word), WordPhoneTable.priority.desc()), filter(lambda e: e.word not in del_words), map(lambda e: (f'{e.word}\t{e.zrm}', e.word[0], e.word[-1])), filter(lambda e: e[1] in char_to_shape and e[2] in char_to_shape), map(lambda e: f'{e[0]}{char_to_shape[e[1]][0]}{char_to_shape[e[2]][-1]}#序20000' ), for_each(lambda e: fout.write(e + '\n'))) with open(f'{output_dir}/sys_eng_data.txt', 'w', encoding='utf8') as fout: fout.write("---config@码表分类=主码-3\n") fout.write("---config@允许编辑=否\n") fout.write(f"---config@码表别名=系统英文\n") pipe( EngWordTable.select().where(EngWordTable.priority > 100).order_by( fn.LENGTH(EngWordTable.word), EngWordTable.priority), filter(lambda e: is_all_alpha(e.word)),
def mean(lst: List[int]) -> int:
    """Return the average of ``lst`` truncated to an int, or 1 if it is empty."""
    count = len(lst)
    if count == 0:
        return 1
    return int(sum(lst) / count)


if __name__ == "__main__":
    # Exactly one argument is expected: the sentences file.
    if len(sys.argv) != 2:
        print(f"Usage: python3 {sys.argv[0]} sents.txt", file=sys.stderr)
        sys.exit(1)
    sents_path = sys.argv[1]

    # Known words seed the segmenter.
    exist_words = pipe(WordPhoneTable.select(), map(lambda row: row.word), set)
    seg = Segger(exist_words, 5)

    # Count how often each segment occurs across all non-blank,
    # non-comment lines (spaces and tabs are removed first).
    with open(sents_path, 'r', encoding='utf8') as fin:
        word_freq = pipe(
            fin,
            map(lambda line: line.strip().replace(" ", "").replace("\t", "")),
            filter(lambda line: line != "" and not line.startswith("#")),
            map(lambda line: seg.cut(line)),
            concat,
            groupby(lambda token: token),
            valmap(lambda group: len(group)),
            dict,
        )

    # Progress output: print one row out of every thousand.
    index = 0
    for item in WordPhoneTable.select():
        index += 1
        if index != 1000:
            continue
        print(item)
        index = 0
#with open(sents_path, 'r', encoding='utf8') as fin: # word_freq = pipe( # fin, map(lambda e: e.strip().replace(" ", "").replace("\t", "")), # filter(lambda e: e != "" and not e.startswith("#")), # map(lambda e: seg.cut(e)), concat, groupby(lambda e: e), # valmap(lambda e: len(e)), dict) chars_freq = {} for item in CharFreqTable.select(): if item.char in chars_freq: raise ("duplicated " + item.char) chars_freq[item.char] = item.freq index = 0 tosave_items = [] for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0): index += 1 if index == 10000: print(item) index = 0 with db.atomic(): WordPhoneTable.bulk_update(tosave_items, [WordPhoneTable.priority], batch_size=200) tosave_items.clear() word = item.word #if word in word_freq: # freq = word_freq[word] #else: # freq = 1
[tuple(e.split("\t")) for e in generate_one_hit_char(60000).keys()]) all_items.extend([ tuple(e.split("\t")) for e in generate_topest_char(char_to_phones, 60000) ]) #系统单字部分 all_items.extend( pipe(CharPhoneTable.select(), filter(lambda e: e.char in char_to_shape), map(lambda e: (e.char, f"{e.xhe+char_to_shape[e.char]}")), list)) del_words = pipe(DelWordTable.select(), map(lambda e: e.word), set) all_items.extend( pipe( WordPhoneTable.select(), filter(lambda e: e.word not in del_words), map(lambda e: (e.word, e.xhe, e.word[0], e.word[-1])), filter(lambda e: e[2] in char_to_shape and e[3] in char_to_shape), map(lambda e: (e[0], e[1] + char_to_shape[e[2]][0] + char_to_shape[ e[3]][-1])), list)) if not os.path.exists(output_dir): os.makedirs(output_dir) with open(output_dir + "/xiaolu_word_for_baidu.ini", 'w', encoding='utf8') as fout: for key, value in groupby(lambda e: e[1], sorted(all_items, key=lambda e: (e[1]))).items(): for i in range(len(value)): fout.write(f"{value[i][1]}={i+1},{value[i][0]}\n")
from toolz.curried import pipe, map
from tables import db, DelWordTable, WordPhoneTable

if __name__ == "__main__":
    # Every word listed in DelWordTable is removed from WordPhoneTable.
    del_words = pipe(DelWordTable.select(), map(lambda row: row.word), set)

    # Report how many rows match before deleting them.
    num = (WordPhoneTable
           .select()
           .where(WordPhoneTable.word.in_(del_words))
           .count())
    print(f"total {num} items to delete")

    (WordPhoneTable
     .delete()
     .where(WordPhoneTable.word.in_(del_words))
     .execute())
    print("done")
word_phones.append((word, f"{c1}{c2}")) elif len(phones) == 3: for c1 in phones[0]: for c2 in phones[1]: for c3 in phones[2]: word_phones.append((word, f"{c1}{c2}{c3}")) else: print(f"{word} {phones} lenght great than 3, exiting...") sys.exit(1) to_add_items = [] exist_items = set() for (word, phones) in word_phones: if f"{word}{phones}" in exist_items: continue if len(phones) != len(word)*2: print(f"D: {word} {phones} wrong.") continue num = WordPhoneTable.select().where(WordPhoneTable.word == word, WordPhoneTable.phones == phones).count() if num > 0: continue to_add_items.append(WordPhoneTable(word=word, phones=phones, priority=1, updatedt=datetime.now())) exist_items.add(f"{word}{phones}") # WordPhoneTable(word=word, phones=phones, priority=1, updatedt=datetime.now()).save() print(f"add length {len(to_add_items)}") with db.atomic(): WordPhoneTable.bulk_create(to_add_items, batch_size=100) print('done') pass
'[1234567890’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,。!@#$%^&*………_+}{}]+', word) is None: return False else: return True if __name__ == "__main__": if len(sys.argv) != 2: print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr) sys.exit(1) _, words_path = sys.argv exist_words = set() exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set) exist_words = exist_words | pipe(DelWordTable.select(), map(lambda e: e.word), set) with open(words_path, "r", encoding='utf8') as fin: #FIXME: bug to fix, we have more phone type now. ft_dict = get_double_dict() to_add_words = pipe( fin, map(lambda e: e.strip().split('\t')), filter(lambda e: len(e) in (1, 2)), filter(lambda e: len(e[0]) <= 5), filter(lambda e: not contain_alpha(e[0]) and not contain_symbols(e[
if re.match('[1234567890’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,。!@#$%^&*………_+}{}]+', word) is None: return False else: return True if __name__ == "__main__": if len(sys.argv) != 2: print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr) print("words format:word prioroty w1_yin w2_yin ...") sys.exit(1) _, words_path = sys.argv exist_words = set() exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set ) exist_words = exist_words | pipe(DelWordTable.select(), map(lambda e: e.word), set ) xhe_transformer = get_full_to_xhe_transformer(); zrm_transformer = get_full_to_zrm_transformmer(); lu_transformer = get_full_to_lu_transformmer(); with open(words_path, "r", encoding='utf8') as fin: to_add_words = pipe(fin,
word) is None: return False else: return True if __name__ == "__main__": if len(sys.argv) != 2: print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr) print("words format:word prioroty w1_yin w2_yin ...") sys.exit(1) _, words_path = sys.argv exist_words = set() exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set) exist_words = exist_words | pipe(DelWordTable.select(), map(lambda e: e.word), set) xhe_transformer = get_full_to_xhe_transformer() zrm_transformer = get_full_to_zrm_transformmer() lu_transformer = get_full_to_lu_transformmer() with open(words_path, "r", encoding='utf8') as fin: to_add_words = pipe( fin, map(lambda e: e.strip().split(' ')), filter(lambda e: len(e) in (1, 2)), filter(lambda e: len(e[0]) <= 5), filter(lambda e: not contain_alpha(e[0]) and not contain_symbols(e[ 0])), filter(lambda e: e[0] not in exist_words),
def fill_full(item: WordPhoneTable) -> WordPhoneTable:
    """Set ``item.full`` to the joined result of ``get_full(item.word)``.

    The pieces are concatenated with no separator. The same object is
    returned so the function can be used inside a mapping pipeline.
    """
    item.full = ''.join(get_full(item.word))
    return item
def fill_zrm(item: WordPhoneTable, zrm: str) -> WordPhoneTable:
    """Store ``zrm`` on ``item.zrm`` and hand the same object back."""
    setattr(item, "zrm", zrm)
    return item
def fill_xhe(item: WordPhoneTable, xhe: str) -> WordPhoneTable:
    """Store ``xhe`` on ``item.xhe`` and hand the same object back."""
    setattr(item, "xhe", xhe)
    return item
def fill_zrm(item: WordPhoneTable, zrm: str) -> WordPhoneTable:
    """Store ``zrm`` on ``item.zrm`` and hand the same object back."""
    setattr(item, "zrm", zrm)
    return item


def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Store ``lu`` on ``item.lu`` and hand the same object back."""
    setattr(item, "lu", lu)
    return item


if __name__ == "__main__":
    # Pass 1: rows whose ``full`` column is empty get it filled in.
    print("check full")
    rows_missing_full = WordPhoneTable.select().where(WordPhoneTable.full == "")
    to_update_full_items = pipe(rows_missing_full, map(fill_full))
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_full_items,
                                   fields=['full'],
                                   batch_size=100)
    del to_update_full_items

    # Pass 2: rows whose ``xhe`` column is empty get it computed from the word.
    print("check xhe")
    full_to_xhe_transformer = get_full_to_xhe_transformer()
    rows_missing_xhe = WordPhoneTable.select().where(WordPhoneTable.xhe == "")
    to_update_xhe_items = pipe(
        rows_missing_xhe,
        map(lambda row: fill_xhe(
            row, word_to_two(row.word, full_to_xhe_transformer))),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_xhe_items,
                                   fields=['xhe'],
                                   batch_size=100)
    del to_update_xhe_items
def mean(lst: List[int]) -> int:
    """Return the average of ``lst`` truncated to an int, or 1 if it is empty."""
    size = len(lst)
    return 1 if size == 0 else int(sum(lst) / size)


if __name__ == "__main__":
    # Exactly one argument is expected: the sentences file.
    if len(sys.argv) != 2:
        print(f"Usage: python3 {sys.argv[0]} sents.txt", file=sys.stderr)
        sys.exit(1)
    sents_path = sys.argv[1]

    # Known words seed the segmenter.
    exist_words = pipe(WordPhoneTable.select(), map(lambda row: row.word), set)
    seg = Segger(exist_words, 5)

    # Count how often each segment occurs across all non-blank,
    # non-comment lines (spaces and tabs are removed first).
    with open(sents_path, 'r', encoding='utf8') as fin:
        word_freq = pipe(
            fin,
            map(lambda line: line.strip().replace(" ", "").replace("\t", "")),
            filter(lambda line: line != "" and not line.startswith("#")),
            map(lambda line: seg.cut(line)),
            concat,
            groupby(lambda token: token),
            valmap(lambda group: len(group)),
            dict,
        )
def full_to_double(pinyin, full_to_two):
    """Translate each two-part entry of ``pinyin`` through ``full_to_two``.

    For every entry, the codes looked up for its first and second elements
    are concatenated. Returns the resulting list, in input order.
    """
    doubles = []
    for parts in pinyin:
        doubles.append(full_to_two[parts[0]] + full_to_two[parts[1]])
    return doubles


def get_double_dict():
    """Load ``FullToTwoTable`` into a ``full -> two`` dict.

    A repeated ``full`` value is reported and ends the process with
    exit status 1.
    """
    mapping = {}
    for row in FullToTwoTable.select():
        if row.full in mapping:
            print(f"ERROR in {row.full}")
            sys.exit(1)
        mapping[row.full] = row.two
    return mapping


if __name__ == "__main__":
    full_to_two = get_double_dict()
    for item in WordPhoneTable.select():
        word = item.word
        phones = item.phones
        # Recompute the code from the word's pinyin and compare it with
        # what is stored.
        pinyin = [split_sy(syllable) for syllable in lazy_pinyin(word)]
        double = ''.join(full_to_double(pinyin, full_to_two))
        if phones == double:
            continue
        # Stored code disagrees with the recomputed one: report and delete.
        print(f"diff in {item.id}, {word}, {phones}, {double}")
        item.delete_instance()
    print("done")