def _segment_full(full: str,
                  candidates_per_char: List[List[str]]
                  ) -> List[Tuple[List[str], str]]:
    """Split an unspaced pinyin string into consecutive syllables.

    Args:
        full: The concatenated pinyin string to split.
        candidates_per_char: For each character (in order), the candidate
            pinyin strings that character may contribute.

    Returns:
        Every ``(segments, remainder)`` pair in which one candidate was
        consumed per entry of ``candidates_per_char`` and ``remainder`` is
        ``''`` (the whole of ``full`` was consumed). Empty when there is no
        such split, or when ``candidates_per_char`` is empty.
    """
    # Each partial result is (syllables chosen so far, text still to match).
    partials: List[Tuple[List[str], str]] = [([], full)]
    for candidates in candidates_per_char:
        partials = [
            (segments + [candidate], rest[len(candidate):])
            for segments, rest in partials
            for candidate in candidates
            if rest.startswith(candidate)
        ]
        if not partials:
            # Dead end: no candidate fits here. Stop instead of letting a
            # later character start matching from the beginning of `full`.
            break
    return [
        (segments, rest) for segments, rest in partials
        if segments and rest == ''
    ]


def update_word_full(char_phones: Dict[str, List[str]]):
    """Repair the ``full`` column of ``WordPhoneTable`` rows.

    A row is left alone when ``full`` already has one space-separated
    segment per character of ``word``. A row with an empty ``full`` is filled
    from ``get_full(word)``. Any other row has its ``full`` re-split using
    the per-character candidates in ``char_phones``; it is only updated when
    exactly one split consumes the whole string, otherwise a diagnostic is
    printed and the row is left unchanged. All changed rows are written back
    in a single transaction.

    Args:
        char_phones: Maps a character to its candidate pinyin strings.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full

        # Handle the empty case first: ''.split(' ') has length 1, so a
        # single-character word with no pinyin would otherwise look valid
        # and never be filled in.
        if full == '':
            full = ' '.join(get_full(words))
            item.full = full
            to_update_items.append(item)
            continue

        if len(words) == len(full.split(' ')):
            continue

        candidates_per_char: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                print(f"{char} not in phone table")
                # FIXME:
                # raise RuntimeError(f"{char} in phone table")
                continue
            # Longest candidates first.
            candidates_per_char.append(
                sorted(char_phones[char], key=len, reverse=True))

        full_arr = _segment_full(full, candidates_per_char)
        if len(full_arr) != 1:
            print(f"wrong format: {item}, {full_arr}")
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if len(to_update_items) > 0:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
def check_tangshitable_pinyin(transformer: Dict[str, str],
                              schema: ShuangPinSchema):
    """Recompute one shuangpin column of ``TangshiTable`` from ``full``.

    For every row, each space-separated segment of ``full`` is split with
    ``split_sy`` and both parts are mapped through ``transformer``; the
    mapped pieces are concatenated. Rows whose stored value for the selected
    schema differs from the recomputed one are updated in a single
    transaction.

    Args:
        transformer: Maps each part returned by ``split_sy`` to its
            shuangpin text.
        schema: Selects the column to check: ``xhe``, ``lu``, ``zrm`` or
            ``bingji``.

    Raises:
        RuntimeError: If ``schema`` is not one of the four known schemas.
    """
    # Resolve the column once, before touching the database, instead of
    # repeating the same if/elif chain for read, write and bulk update.
    schema_fields = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    field = next(
        (name for known, name in schema_fields if schema == known), None)
    if field is None:
        raise RuntimeError(f'unknown schema: {schema}')

    to_update_items = []
    for item in TangshiTable.select():
        pieces = []
        for full in item.full.split(' '):
            s, y = split_sy(full)
            pieces.append(transformer[s] + transformer[y])
        full_shuangpins = ''.join(pieces)

        if full_shuangpins != getattr(item, field):
            setattr(item, field, full_shuangpins)
            to_update_items.append(item)

    with db.atomic():
        TangshiTable.bulk_update(to_update_items,
                                 fields=[field],
                                 batch_size=100)
    print(to_update_items)
    print(f'update {len(to_update_items)} tangshitable items')
def check_chars_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of ``CharPhoneTable`` from ``full``.

    For every row, ``full`` is split with ``split_sy`` and both parts are
    mapped through ``transformer`` and concatenated. Rows whose stored value
    for the selected schema differs from the recomputed one are updated in a
    single transaction.

    Args:
        transformer: Maps each part returned by ``split_sy`` to its
            shuangpin text.
        schema: Selects the column to check: ``xhe``, ``lu``, ``zrm`` or
            ``bingji``.

    Raises:
        RuntimeError: If ``schema`` is not one of the four known schemas.
    """
    # Resolve the column once, before touching the database, instead of
    # repeating the same if/elif chain for read, write and bulk update.
    schema_fields = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    field = next(
        (name for known, name in schema_fields if schema == known), None)
    if field is None:
        # The message used to read "unkonwn schame"; spelling fixed.
        raise RuntimeError(f"unknown schema: {schema}")

    to_update_items = []
    for item in CharPhoneTable.select():
        s, y = split_sy(item.full)
        sp = transformer[s] + transformer[y]
        if getattr(item, field) != sp:
            setattr(item, field, sp)
            to_update_items.append(item)

    with db.atomic():
        CharPhoneTable.bulk_update(to_update_items,
                                   fields=[field],
                                   batch_size=100)
    print(to_update_items)
    print(f'update {len(to_update_items)} char items')
def main():
    """Load words from the file named on the command line and insert them.

    Expects exactly one argument, the path of the words file. With any other
    argument count, the usage text is printed to stderr and the process exits
    with status 1. Otherwise the rows returned by ``load_words`` are printed
    and bulk-inserted into ``TangshiTable`` inside one transaction.
    """
    if len(sys.argv) != 2:
        # The whole usage message belongs on stderr, not just its first line.
        usage_lines = (
            f"使用方法: python3 {sys.argv[0]} words.txt",
            "文件行格式:word [priority w1_yin w2_yin ...]",
            "举例:你好 [10 ni hao]",
            "中括号内为可选内容",
        )
        for line in usage_lines:
            print(line, file=sys.stderr)
        sys.exit(1)

    _, words_path = sys.argv
    add_words = load_words(words_path)
    print(add_words)
    with db.atomic():
        TangshiTable.bulk_create(add_words, batch_size=100)
    print(f'done, add {len(add_words)} items')
# valmap(lambda e: len(e)), dict) chars_freq = {} for item in CharFreqTable.select(): if item.char in chars_freq: raise ("duplicated " + item.char) chars_freq[item.char] = item.freq index = 0 tosave_items = [] for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0): index += 1 if index == 10000: print(item) index = 0 with db.atomic(): WordPhoneTable.bulk_update(tosave_items, [WordPhoneTable.priority], batch_size=200) tosave_items.clear() word = item.word #if word in word_freq: # freq = word_freq[word] #else: # freq = 1 freqs = [(chars_freq[word[e]] if word[e] in chars_freq else 10) for e in range(len(word))] # print(freqs) priority = get_priority(freqs)