def cols_to_tangshi_item(cols: List[str], xhe_transformer, zrm_transformer,
                         bingji_transformer,
                         lu_transformer) -> "TangshiTable":
    """Build a ``TangshiTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]``: priority defaults to ``100`` and the pinyin comes from
      ``get_full(word)``.
    * ``[word, priority]``: the pinyin comes from ``get_full(word)``.
    * ``[word, priority, p1, ..., pn]``: the pinyin is taken from the
      trailing columns, stripped, with blank entries dropped.

    In the two explicit layouts the priority is passed through exactly as it
    appears in ``cols[1]`` (a string); only the default is an ``int``.

    Args:
        cols: Columns of one line; ``cols[0]`` is the word.
        xhe_transformer: Passed to ``full_to_two`` to build the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to build the ``zrm`` field.
        bingji_transformer: Passed to ``full_to_two`` together with
            ``bingji=True`` to build the ``bingji`` field.
        lu_transformer: Passed to ``full_to_two`` to build the ``lu`` field.

    Returns:
        The newly constructed ``TangshiTable``; it is printed before being
        returned.

    Raises:
        RuntimeError: If ``cols`` is empty or its length fits no layout.
    """
    usage = "word item should be: 你好 [priority ni hao]"
    if not cols:
        # Without this guard the layout check below would index cols[0] and
        # fail with IndexError instead of the RuntimeError used for every
        # other malformed line.
        raise RuntimeError(usage)

    word = cols[0]
    if len(cols) == 1:
        priority = 100
        full = get_full(word)
    elif len(cols) == 2:
        priority = cols[1]
        full = get_full(word)
    elif len(cols) == 2 + len(word):
        priority = cols[1]
        full = [syllable
                for syllable in (col.strip() for col in cols[2:])
                if syllable]
    else:
        raise RuntimeError(usage)

    item = TangshiTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full),
    )
    print("add ", item)
    return item
def cols_to_word_phone_table(cols: List[str], xhe_transformer,
                             zrm_transformer) -> "WordPhoneTable":
    """Build a ``WordPhoneTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]``: priority defaults to ``1`` and the pinyin comes from
      ``get_full(word)``.
    * ``[word, priority]``: the pinyin comes from ``get_full(word)``.
    * ``[word, priority, p1, ..., pn]``: the pinyin is taken from the
      trailing columns, stripped, with blank entries dropped.

    The pinyin pieces are concatenated without a separator in ``full`` and
    ``lu`` is always stored as an empty string.  In the two explicit layouts
    the priority is passed through exactly as it appears in ``cols[1]``.

    Args:
        cols: Columns of one line; ``cols[0]`` is the word.
        xhe_transformer: Passed to ``full_to_two`` to build the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to build the ``zrm`` field.

    Returns:
        The newly constructed ``WordPhoneTable``.

    Raises:
        RuntimeError: If ``cols`` is empty or its length fits no layout.
    """
    usage = "word item should be: 你好 [priority n i h ao]"
    if not cols:
        # Without this guard the layout check below would index cols[0] and
        # fail with IndexError instead of the RuntimeError used for every
        # other malformed line.
        raise RuntimeError(usage)

    word = cols[0]
    if len(cols) == 1:
        priority = 1
        full = get_full(word)
    elif len(cols) == 2:
        priority = cols[1]
        full = get_full(word)
    elif len(cols) == 2 + len(word):
        priority = cols[1]
        full = [syllable
                for syllable in (col.strip() for col in cols[2:])
                if syllable]
    else:
        raise RuntimeError(usage)

    return WordPhoneTable(
        word=word,
        full=''.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu="",
        priority=priority,
        updatedt=datetime.now(),
    )
def _segment_full(
        full: str,
        candidates_per_char: List[List[str]],
) -> List[Tuple[List[str], str]]:
    """Split ``full`` into one candidate pinyin per character, in order.

    Args:
        full: The unseparated pinyin string to split.
        candidates_per_char: For each character, the pinyin strings it may
            contribute, tried in the given order.

    Returns:
        Every complete split as ``(syllables, '')``: one candidate was
        chosen for each entry of ``candidates_per_char`` and nothing of
        ``full`` is left over.  Empty when no such split exists or when
        ``candidates_per_char`` is empty.
    """
    # Each entry pairs the syllables chosen so far with the unconsumed tail.
    # Seeding with the empty choice lets every character use the same
    # extension step, and once no partial split survives the list stays
    # empty instead of restarting from the beginning of ``full``.
    partial: List[Tuple[List[str], str]] = [([], full)]
    for candidates in candidates_per_char:
        partial = [
            (chosen + [candidate], rest[len(candidate):])
            for chosen, rest in partial
            for candidate in candidates
            if rest.startswith(candidate)
        ]
    return [(chosen, rest) for chosen, rest in partial
            if len(chosen) > 0 and rest == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Rewrite ``WordPhoneTable.full`` as space-separated per-character pinyin.

    For every row returned by ``WordPhoneTable.select()``:

    * an empty ``full`` is regenerated from ``get_full(word)``;
    * a ``full`` that already has as many space-separated parts as the word
      has characters is left alone;
    * otherwise ``full`` is split using the candidate pinyin of each
      character from ``char_phones``; the row is updated only when exactly
      one split consumes the whole string, else a diagnostic is printed.

    Changed rows are written back with one ``bulk_update`` of the ``full``
    field inside ``db.atomic()``.

    Args:
        char_phones: Maps a character to the pinyin strings it may have.
            Characters missing from the mapping are reported and skipped.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full

        if full == '':
            # Checked before the length test: ''.split(' ') is [''] (length
            # 1), so a one-character word with no pinyin would otherwise be
            # mistaken for an already-separated row and never be filled.
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        candidates_per_char: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                print(f"{char} not in phone table")
                # FIXME: decide whether a missing character should raise
                # RuntimeError instead of being skipped.
                continue
            # Longest candidates first, ties keep their original order.
            candidates_per_char.append(
                sorted(char_phones[char], key=len, reverse=True))

        full_arr = _segment_full(full, candidates_per_char)
        if len(full_arr) != 1:
            print(f"wrong format: {item}, {full_arr}")
            # FIXME: decide whether an ambiguous or failed split should
            # raise RuntimeError instead of only being reported.
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if len(to_update_items) > 0:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
def cols_to_word_phone_table(
        cols: List[str], xhe_transformer, zrm_transformer,
        bingji_transformer,
        lu_transformer) -> Union["WordPhoneTable", None]:
    """Build a ``WordPhoneTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]``: priority ``100``, pinyin from ``get_full(word)``.  Any
      exception raised by ``get_full`` is printed and ``None`` is returned.
    * ``[word, p1, ..., pn]``: priority ``100``, pinyin from the columns.
    * ``[word, p1, ..., pn, priority]``: pinyin from the middle columns and
      the last column converted with ``int``.

    Pinyin columns are stripped and blank entries are dropped.  There is no
    ``[word, priority]`` layout: two columns are read as a one-character
    word followed by its pinyin.

    Args:
        cols: Columns of one line; ``cols[0]`` is the word.
        xhe_transformer: Passed to ``full_to_two`` to build the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to build the ``zrm`` field.
        bingji_transformer: Passed to ``full_to_two`` together with
            ``bingji=True`` to build the ``bingji`` field.
        lu_transformer: Passed to ``full_to_two`` to build the ``lu`` field.

    Returns:
        The newly constructed ``WordPhoneTable`` (printed before being
        returned), or ``None`` when ``get_full`` failed in the one-column
        layout.

    Raises:
        RuntimeError: If ``cols`` is empty or its length fits no layout.
        ValueError: If the trailing priority column is not an integer.
    """
    usage = "word item should be: 你好 [ni hao 100]"
    if not cols:
        # Without this guard the layout checks below would index cols[0]
        # and fail with IndexError instead of the RuntimeError used for
        # every other malformed line.
        raise RuntimeError(usage)

    word = cols[0]
    if len(cols) == 1:
        priority = 100
        try:
            full = get_full(word)
        except Exception as e:
            # Deliberate best effort: report the failure and let the caller
            # skip this word.
            print(e)
            return None
    elif len(cols) == 1 + len(word):
        priority = 100
        full = [syllable
                for syllable in (col.strip() for col in cols[1:])
                if syllable]
    elif len(cols) == 2 + len(word):
        priority = int(cols[-1])
        # Stop before the last column: it is the priority, not a syllable.
        full = [syllable
                for syllable in (col.strip() for col in cols[1:-1])
                if syllable]
    else:
        raise RuntimeError(usage)

    item = WordPhoneTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full),
    )
    print("add ", item)
    return item
def fill_full(item: WordPhoneTable) -> WordPhoneTable:
    """Set ``item.full`` from ``get_full(item.word)`` and return the item.

    The pieces returned by ``get_full`` are concatenated without a
    separator.  The item is modified in place; the same object is returned.
    """
    item.full = ''.join(get_full(item.word))
    return item
for_each(lambda e: print(e)), ) print(f"null phones item is: {len(null_phones_items)}") sys.exit(1) del null_phones_items null_full_items = pipe( CharPhoneTable.select().where(CharPhoneTable.full == ''), list, ) if len(null_full_items) != 0: print(f"null full items is {len(null_full_items)}") pipe( null_full_items, map(lambda e: (e, ''.join(get_full(e.char)))), map(lambda e: update_full(e[0], e[1])), for_each(lambda e: e.save()), ) del null_full_items full_to_xhe_transformer = get_full_to_xhe_transformer() xhe_full_neq_items = pipe( CharPhoneTable.select(), filter(lambda e: e.phones != full_to_two(e.full, full_to_xhe_transformer)), list, ) if len(xhe_full_neq_items) != 0: print(f"xhe full not equal len is {len(xhe_full_neq_items)}")
def load_chars(filepath: str):
    """Load characters from a text file into the phone and shape tables.

    Each non-blank line holds columns separated by single spaces::

        char shape [pinyin [priority]]

    A line is reported and skipped when it has fewer than two or more than
    four columns, when ``common.contain_alpha`` flags the character, when
    the character is not exactly one code point long, when ``shape`` (or a
    given ``pinyin``) is not purely alphabetic, or when the priority column
    is not an integer.  A missing pinyin is filled from
    ``common.get_full(char)``; priorities below 1 are raised to 1.  Lines
    whose ``char + pinyin`` is already known (from
    ``common.get_exists_charyinpins()`` or from an earlier line of this
    file) are reported and skipped.

    The accepted rows are written with ``bulk_create`` to
    ``tables.CharHeShapeTable`` and then ``tables.CharPhoneTable``, each in
    its own ``tables.db.atomic()`` block.

    Args:
        filepath: Path of the UTF-8 text file to read.
    """
    exist_charpinyins = common.get_exists_charyinpins()
    chars = []
    shapes = []
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if len(line) == 0:
                continue
            cols = line.split(" ")
            if len(cols) < 2:
                print(f"{line} broken")
                continue
            if len(cols) > 4:
                print(f"broken line {line}")
                continue

            char, shape = cols[0], cols[1]
            pinyin = cols[2] if len(cols) >= 3 else None
            priority = 1
            if len(cols) == 4:
                try:
                    priority = int(cols[3])
                except ValueError:
                    # A non-numeric priority is just another malformed
                    # line: report it and carry on rather than aborting
                    # the whole import.
                    print(f"broken line {line}")
                    continue

            malformed = (
                common.contain_alpha(word=char)
                or len(char) != 1
                or not shape.isalpha()
                or (pinyin is not None and not pinyin.isalpha())
            )
            if malformed:
                print(f"broken line {line}")
                continue

            if pinyin is None:
                pinyin = ''.join(common.get_full(char))
            if priority < 1:
                priority = 1

            key = char + pinyin
            if key in exist_charpinyins:
                print(f"already exists {line}")
                continue
            # Remember the pair so later duplicates in this file are caught.
            exist_charpinyins.add(key)

            chars.append(tables.CharPhoneTable(
                char=char,
                full=pinyin,
                xhe='',
                lu='',
                zrm='',
                bingji='',
                priority=priority,
                updatedt=datetime.now(),
            ))
            shapes.append(tables.CharHeShapeTable(
                char=char,
                shapes=shape,
                priority=priority,
                updatedt=datetime.now(),
            ))

    with tables.db.atomic():
        tables.CharHeShapeTable.bulk_create(shapes, batch_size=100)
    print(f"add he shape: {shapes}")
    print(f"add he shape num: {len(shapes)}")

    with tables.db.atomic():
        tables.CharPhoneTable.bulk_create(chars, batch_size=100)
    print(f"add char phone: {chars}")
    print(f"add char phone num: {len(chars)}")