def get_char_to_lu_phones() -> Dict[str, List[str]]:
    """Group the `lu` value of every CharPhoneTable row by its `char`.

    Rows whose `char` or `lu` equals the empty string are left out.
    Values are appended in the order the query yields the rows, and
    duplicates are kept.

    Returns:
        A dict mapping each character to the list of its `lu` values.
    """
    lu_by_char: Dict[str, List[str]] = {}
    for row in CharPhoneTable.select():
        char, lu = row.char, row.lu
        # Only rows with both columns filled in take part in the grouping.
        if char == '' or lu == '':
            continue
        lu_by_char.setdefault(char, []).append(lu)
    return lu_by_char
def load_char_phones() -> Dict[str, List[str]]:
    """Collect the `full` value of every CharPhoneTable row, grouped by `char`.

    Returns:
        A dict mapping each character to the list of `full` values of its
        rows, in the order the query yields them (duplicates are kept).
    """
    full_by_char: Dict[str, List[str]] = {}
    for row in CharPhoneTable.select():
        # setdefault creates the list on the first sighting of a character.
        full_by_char.setdefault(row.char, []).append(row.full)
    return full_by_char
def check_chars_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one schema's shuangpin column and persist the rows that differ.

    For every CharPhoneTable row, `full` is split with `split_sy`, both
    parts are looked up in `transformer`, and the concatenated result is
    compared with the column that belongs to `schema`. Rows whose stored
    value differs are updated in memory and then written back with a single
    batched `bulk_update` inside a transaction. The changed rows and their
    count are printed at the end.

    Args:
        transformer: lookup applied to each of the two parts returned by
            `split_sy`; the two results are concatenated.
        schema: one of XHE_SP_SCHEMA, LU_SP_SCHEMA, ZRM_SP_SCHEMA or
            BINGJI_SP_SCHEMA; selects which column is checked.

    Raises:
        RuntimeError: if `schema` is none of the supported schemas. This is
            now raised before any row is read or any transaction is opened.
    """
    # Single source of truth for the schema -> column mapping. Schemas are
    # compared with `==` (not used as dict keys) because nothing visible
    # here guarantees that they are hashable.
    schema_columns = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    for known_schema, column in schema_columns:
        if schema == known_schema:
            break
    else:
        raise RuntimeError(f"unknown schema {schema}")

    to_update_items = []
    for item in CharPhoneTable.select():
        s, y = split_sy(item.full)
        sp = transformer[s] + transformer[y]
        # Only rows whose stored code is stale need to be written back.
        if getattr(item, column) != sp:
            setattr(item, column, sp)
            to_update_items.append(item)

    with db.atomic():
        CharPhoneTable.bulk_update(to_update_items,
                                   fields=[column],
                                   batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} char items')
def get_exists_chars() -> Set[str]:
    """Return the set of distinct `char` values found in CharPhoneTable."""
    return {row.char for row in CharPhoneTable.select()}
if len(sys.argv) != 1: print("USAGE: python3 generate_dd_txt.py ") sys.exit(1) fname, output_dir = sys.argv[0], "zrm_phone_xhe_shape" if not Path(output_dir).exists(): os.makedirs(output_dir) char_to_shape = pipe(CharShapeTable.select(), map(lambda e: (e.char, e.shapes)), reduceby(lambda e: e[0], lambda e1, e2: e1), valmap(lambda e: e[1]), dict) print(f"total {len(char_to_shape)} char shapes") char_to_phones = pipe(CharPhoneTable.select(), map(lambda e: (e.char, e.zrm)), groupby(lambda e: e[0]), valmap(lambda phones: [e[1] for e in phones]), dict) print(f"total {len(char_to_phones)} char phones") one_hit_char_items = generate_one_hit_char(60000) top_single_chars_items = generate_topest_char(char_to_phones, 60000) sys_top_chars_data = f"{output_dir}/sys_top_chars_data.txt" with open(sys_top_chars_data, 'w', encoding='utf8') as fout: fout.write("---config@码表分类=主码-1\n") fout.write("---config@允许编辑=否\n") fout.write(f"---config@码表别名=简码单字\n") for item in one_hit_char_items.items(): fout.write(f"{item[0]}#序{item[1]}\n") for item in top_single_chars_items.items():
再进入“高级设置→管理个性短语→导入个性短语”导入“xiaolu_word_for_baidu.ini”文件即可。 ''') sys.exit(1) fname, output_dir = sys.argv[0], "baidu_mobile_ini" if not Path(output_dir).exists(): os.makedirs(output_dir) char_to_shape = pipe(CharShapeTable.select(), map(lambda e: (e.char, e.shapes)), reduceby(lambda e: e[0], lambda e1, e2: e1), valmap(lambda e: e[1]), dict) print(f"total {len(char_to_shape)} char shapes") char_to_phones = pipe(CharPhoneTable.select(), map(lambda e: (e.char, e.xhe)), groupby(lambda e: e[0]), valmap(lambda phones: [e[1] for e in phones]), dict) print(f"total {len(char_to_phones)} char phones") all_items = [] #单字部分 all_items.extend( [tuple(e.split("\t")) for e in generate_one_hit_char(60000).keys()]) all_items.extend([ tuple(e.split("\t")) for e in generate_topest_char(char_to_phones, 60000) ]) #系统单字部分
import sys, os from collections import defaultdict from datetime import datetime from tables import db, WordPhoneTable, CharPhoneTable if __name__ == "__main__": if len(sys.argv) != 2: print("USAGE: python3 dump_word_nophones.py word.txt") sys.exit(1) _, word_txt_path = sys.argv char_to_phones = defaultdict(list) for item in CharPhoneTable.select(): if item.phones not in char_to_phones[item.char]: char_to_phones[item.char].append(item.phones) with open(word_txt_path, 'r', encoding='utf8') as fin: word_phones = [] for line in fin: line = line.strip() if line == "" or line.startswith(";"): continue word = "".join([e for e in line if e not in "abcdefghijklmnopqrstuvwxyz"]) if len(word) > 3: continue phones = [] for char in word: if char not in char_to_phones:
def fill_zrm(item: CharPhoneTable, transformer: Dict[str, str]) -> Tuple[CharPhoneTable, bool]: sy = split_sy(item.full) if sy[0] not in transformer or sy[1] not in transformer: print(f"{sy} not in transformer", file=sys.stderr) return item, False item.zrm = transformer[sy[0]] + transformer[sy[1]] return item, True if __name__ == "__main__": null_phones_items = pipe( CharPhoneTable.select().where(CharPhoneTable.phones == ''), list, ) if len(null_phones_items) != 0: pipe( null_phones_items, for_each(lambda e: print(e)), ) print(f"null phones item is: {len(null_phones_items)}") sys.exit(1) del null_phones_items null_full_items = pipe( CharPhoneTable.select().where(CharPhoneTable.full == ''), list,
import sys, os
from datetime import datetime
from tables import db, CharPhoneTable

if __name__ == "__main__":
    # Exactly one command-line argument is expected: the input file path.
    if len(sys.argv) != 2:
        print("USAGE: python3 dump_char_phone_table.py char_phone.txt")
        sys.exit(1)
    char_phone_path = sys.argv[1]

    with open(char_phone_path, 'r', encoding='utf8') as fin:
        for raw_line in fin:
            line = raw_line.strip()
            columns = line.split('\t')
            # A usable line has exactly two tab-separated columns.
            if len(columns) != 2:
                print(f"ERROR line {line} in file {char_phone_path}")
                continue

            char, phones = (column.strip() for column in columns)
            stored_count = CharPhoneTable.select().where(
                CharPhoneTable.char == char,
                CharPhoneTable.phones == phones).count()
            # Never insert the same (char, phones) pair twice.
            if stored_count > 0:
                print(f"WARNING: char phone already exists, {line}")
                continue

            CharPhoneTable(char=char,
                           phones=phones,
                           priority=1,
                           updatedt=datetime.now()).save()

    # NOTE(review): the snippet's original indentation is lost; 'done' is
    # assumed to be printed once, after the whole file has been processed.
    print('done')
raise RuntimeError(f"{sy} not in transformer") item.zrm = transformer[sy[0]] + transformer[sy[1]] return item def fill_lu(item: CharPhoneTable, transformer: Dict[str, str]) -> CharPhoneTable: sy = split_sy(item.full) if sy[0] not in transformer or sy[1] not in transformer: raise RuntimeError(f"{sy} not in transformer") item.lu = transformer[sy[0]] + transformer[sy[1]] return item if __name__ == "__main__": null_xhe_count = CharPhoneTable.select().where(CharPhoneTable.xhe == '').count() if null_xhe_count != 0: print(f'{null_xhe_count} null xhe phones, please check manually.') null_full_count = CharPhoneTable.select().where(CharPhoneTable.full == '').count() if null_full_count != 0: to_update_full_items = pipe(CharPhoneTable.select().where(CharPhoneTable.full == ''), map(lambda e: (e, ''.join(get_full(e.char)))), map(lambda e: update_full(e[0], e[1])), ) with db.atomic(): CharPhoneTable.bulk_update(to_update_full_items, fields=['full'], batch_size=100) del to_update_full_items null_full_count = CharPhoneTable.select().where(CharPhoneTable.full == '').count() if null_full_count != 0: print(f'{null_full_count} null full full phones, please check manually.')
item.zrm = transformer[sy[0]] + transformer[sy[1]] return item def fill_lu(item: CharPhoneTable, transformer: Dict[str, str]) -> CharPhoneTable: sy = split_sy(item.full) if sy[0] not in transformer or sy[1] not in transformer: raise RuntimeError(f"{sy} not in transformer") item.lu = transformer[sy[0]] + transformer[sy[1]] return item if __name__ == "__main__": null_xhe_count = CharPhoneTable.select().where( CharPhoneTable.xhe == '').count() if null_xhe_count != 0: print(f'{null_xhe_count} null xhe phones, please check manually.') null_full_count = CharPhoneTable.select().where( CharPhoneTable.full == '').count() if null_full_count != 0: to_update_full_items = pipe( CharPhoneTable.select().where(CharPhoneTable.full == ''), map(lambda e: (e, ''.join(get_full(e.char)))), map(lambda e: update_full(e[0], e[1])), ) with db.atomic(): CharPhoneTable.bulk_update(to_update_full_items, fields=['full'], batch_size=100)