def _complete_segmentations(
        full: str,
        candidates_per_char: List[List[str]],
) -> List[Tuple[List[str], str]]:
    """Split ``full`` into one candidate reading per character.

    Args:
        full: The pinyin string to segment.
        candidates_per_char: For each character, in word order, the readings
            that may be consumed from the front of the remaining string.

    Returns:
        Every ``(segments, '')`` pair in which exactly one candidate was
        chosen for each entry of ``candidates_per_char`` and the whole of
        ``full`` was consumed. Empty when no such split exists or when
        ``candidates_per_char`` is empty.
    """
    if not candidates_per_char:
        return []
    # Each partial result is (segments chosen so far, unconsumed remainder).
    partials: List[Tuple[List[str], str]] = [([], full)]
    for candidates in candidates_per_char:
        partials = [
            (segments + [candidate], rest[len(candidate):])
            for segments, rest in partials
            for candidate in candidates
            if rest.startswith(candidate)
        ]
        if not partials:
            # Dead end: stop here instead of letting a later character
            # restart matching from the beginning of ``full``.
            return []
    return [partial for partial in partials if partial[1] == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Rewrite ``WordPhoneTable.full`` as space-separated per-character pinyin.

    Rows whose ``full`` already has one space-separated entry per character
    of ``word`` are left alone. Rows with an empty ``full`` are filled from
    ``get_full(word)``. All other rows are re-segmented using the candidate
    readings in ``char_phones``; a row is updated only when exactly one
    segmentation consumes the whole string, otherwise it is reported and
    skipped. Modified rows are written back in a single transaction.

    Args:
        char_phones: Maps a character to its candidate pinyin readings.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full

        # This must be tested before the length comparison: ''.split(' ')
        # is [''], so a one-character word with an empty ``full`` would
        # otherwise be treated as already up to date and never filled.
        if full == '':
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        candidates_per_char: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                print(f"{char} not in phone table")
                # FIXME:
                # raise RuntimeError(f"{char} in phone table")
                continue
            # Longest readings first.
            candidates_per_char.append(
                sorted(char_phones[char], key=len, reverse=True))

        full_arr = _complete_segmentations(full, candidates_per_char)
        if len(full_arr) != 1:
            # Zero or ambiguous segmentations: report and leave the row as is.
            print(f"wrong format: {item}, {full_arr}")
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if to_update_items:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items, fields=['full'],
                                       batch_size=100)
    print("done")
def check_wordphonetable_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of ``WordPhoneTable`` from ``full``.

    For every row, each space-separated entry of ``full`` is split with
    ``split_sy`` and both parts are mapped through ``transformer``; the
    concatenation is compared with the stored value of the column selected
    by ``schema``. Rows that differ are updated in one transaction.

    Args:
        transformer: Maps each part returned by ``split_sy`` to its
            shuangpin key(s).
        schema: One of ``XHE_SP_SCHEMA``, ``LU_SP_SCHEMA``,
            ``ZRM_SP_SCHEMA`` or ``BINGJI_SP_SCHEMA``.

    Raises:
        RuntimeError: If ``schema`` is none of the four known schemas.
    """

    def shuangpin_column() -> str:
        # Evaluated at each use rather than once up front, so an unknown
        # schema is reported at the same points as before.
        if schema == XHE_SP_SCHEMA:
            return 'xhe'
        if schema == LU_SP_SCHEMA:
            return 'lu'
        if schema == ZRM_SP_SCHEMA:
            return 'zrm'
        if schema == BINGJI_SP_SCHEMA:
            return 'bingji'
        raise RuntimeError(f'unknown schema: {schema}')

    stale_rows = []
    for row in WordPhoneTable.select():
        syllables = row.full.split(' ')
        stored = getattr(row, shuangpin_column())
        expected = ''.join(
            transformer[s] + transformer[y]
            for s, y in map(split_sy, syllables)
        )
        if expected == stored:
            continue
        setattr(row, shuangpin_column(), expected)
        stale_rows.append(row)

    with db.atomic():
        WordPhoneTable.bulk_update(stale_rows, fields=[shuangpin_column()],
                                   batch_size=100)
    print(stale_rows)
    print(f'update {len(stale_rows)} wordphonetable items')
# Build a per-character frequency lookup from CharFreqTable.
chars_freq = {}
for item in CharFreqTable.select():
    if item.char in chars_freq:
        # Raising a plain str is itself a TypeError and loses the message,
        # so raise a real exception instead.
        raise RuntimeError("duplicated " + item.char)
    chars_freq[item.char] = item.freq

# Recompute the priority of every word whose priority is <= 0, writing the
# pending rows back each time the counter reaches 10000.
index = 0
tosave_items = []
for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0):
    index += 1
    if index == 10000:
        print(item)  # progress marker
        index = 0
        with db.atomic():
            WordPhoneTable.bulk_update(tosave_items, [WordPhoneTable.priority],
                                       batch_size=200)
        tosave_items.clear()
    word = item.word
    #if word in word_freq:
    #    freq = word_freq[word]
    #else:
    #    freq = 1
    # Characters missing from the frequency table fall back to 10.
    freqs = [chars_freq.get(char, 10) for char in word]
    # print(freqs)
    item.priority = get_priority(freqs)
    tosave_items.append(item)
# NOTE(review): rows appended since the last flush are still pending in
# tosave_items when the loop ends; no final bulk_update is visible in this
# chunk -- confirm that one follows.
# (tail of a definition that begins before this chunk)
return item


def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Store ``lu`` in ``item.lu`` and return the same ``item``."""
    item.lu = lu
    return item


if __name__ == "__main__":
    # Each step below selects the rows whose column is empty, fills it, and
    # saves the rows in one transaction.
    # NOTE(review): pipe/map are presumably curried helpers from a
    # functional library imported elsewhere in the file -- confirm.
    print("check full")
    # Rows with an empty `full` are passed through fill_full (defined
    # outside this chunk).
    to_update_full_items = pipe(WordPhoneTable.select().where(WordPhoneTable.full == ""),
        map(lambda e: fill_full(e)),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_full_items, fields=['full'], batch_size=100)
    del to_update_full_items

    print("check xhe")
    full_to_xhe_transformer = get_full_to_xhe_transformer()
    # Rows with an empty `xhe`: pair each row with
    # word_to_two(word, transformer), then hand the pair to fill_xhe.
    to_update_xhe_items = pipe(WordPhoneTable.select().where(WordPhoneTable.xhe == ""),
        map(lambda e: (e, word_to_two(e.word, full_to_xhe_transformer))),
        map(lambda e: fill_xhe(e[0], e[1])),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_xhe_items, fields=['xhe'], batch_size=100)
    del to_update_xhe_items
    del full_to_xhe_transformer

    print("check zrm")
    # NOTE(review): "transformmer" is spelled with a double "m" -- confirm
    # it matches the helper's definition.
    full_to_zrm_transformer = get_full_to_zrm_transformmer()