コード例 #1
0
def _segment_full(words: str, full: str,
                  char_phones: Dict[str, List[str]]) -> List[List[str]]:
    """Return every way to cut ``full`` into one reading per character.

    Each character of ``words`` must consume, in order, one of its candidate
    readings from ``char_phones`` as a prefix of what is left of ``full``;
    a segmentation is kept only if the whole string is consumed.

    Returns an empty list when any character has no entry in ``char_phones``
    or when no complete segmentation exists.
    """
    missing = [char for char in words if char not in char_phones]
    for char in missing:
        print(f"{char} not in phone table")
    if missing:
        # FIXME: the original author considered raising here instead:
        # raise RuntimeError(f"{char} in phone table")
        # A character without readings cannot be assigned a syllable, so no
        # segmentation can have one syllable per character.
        return []

    # Each state is (syllables chosen so far, unconsumed rest of ``full``).
    # Seeding with an explicit initial state (instead of testing whether the
    # state list is empty) keeps a dead end from being mistaken for "first
    # character" and restarting the match at the beginning of ``full``.
    states: List[Tuple[List[str], str]] = [([], full)]
    for char in words:
        # Longest candidates first, as in the original ordering.
        candidates = sorted(char_phones[char], key=lambda e: -len(e))
        states = [(chosen + [candidate], rest[len(candidate):])
                  for chosen, rest in states
                  for candidate in candidates
                  if rest.startswith(candidate)]
        if not states:
            break
    return [chosen for chosen, rest in states if chosen and rest == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Repair the ``full`` column of WordPhoneTable rows.

    A row is considered correct when ``full`` has exactly one
    space-separated part per character of ``word``. Otherwise:

    * an empty ``full`` is regenerated with ``get_full(word)``;
    * a non-empty ``full`` (presumably readings written without separators;
      confirm against the data) is segmented using the per-character
      candidate readings in ``char_phones`` and stored space-joined, but
      only when exactly one segmentation exists. Ambiguous or impossible
      rows are reported on stdout and left untouched.

    All changed rows are written back in a single transaction.

    Args:
        char_phones: maps a single character to its candidate readings.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full
        if full == '':
            # Must be tested before the count check: ''.split(' ') is [''],
            # so a one-character word with no reading would look valid.
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        segmentations = _segment_full(words, full, char_phones)
        if len(segmentations) != 1:
            print(f"wrong format: {item}, {segmentations}")
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(segmentations[0])
            to_update_items.append(item)

    if to_update_items:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
コード例 #2
0
def check_wordphonetable_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of every WordPhoneTable row from ``full``.

    For each row, every space-separated part of ``item.full`` is split with
    ``split_sy`` into two pieces, both pieces are looked up in
    ``transformer`` and the results are concatenated. If the concatenation
    differs from the value stored in the column that belongs to ``schema``,
    the row is corrected. All corrected rows are written back in one
    transaction, then printed together with their count.

    Args:
        transformer: maps each piece returned by ``split_sy`` to its code.
        schema: one of ``XHE_SP_SCHEMA``, ``LU_SP_SCHEMA``, ``ZRM_SP_SCHEMA``
            or ``BINGJI_SP_SCHEMA``; selects the column to check.

    Raises:
        RuntimeError: if ``schema`` is none of the four known schemas.
        KeyError: if a piece is missing from ``transformer``.
    """
    # Resolve the target column once, before touching the database, instead
    # of repeating the same four-way dispatch for read, write and update.
    # Matching uses ``==`` (as the original did), so schemas need not be
    # hashable.
    schema_columns = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    for known_schema, column in schema_columns:
        if schema == known_schema:
            break
    else:
        raise RuntimeError(f'unknown schema: {schema}')

    to_update_items = []
    for item in WordPhoneTable.select():
        full_shuangpins = ''.join(
            transformer[s] + transformer[y]
            for s, y in map(split_sy, item.full.split(' ')))
        if full_shuangpins != getattr(item, column):
            setattr(item, column, full_shuangpins)
            to_update_items.append(item)

    # Nothing to write means no transaction is needed.
    if to_update_items:
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=[column],
                                       batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} wordphonetable items')
コード例 #3
0
    chars_freq = {}
    for item in CharFreqTable.select():
        if item.char in chars_freq:
            raise ("duplicated " + item.char)
        chars_freq[item.char] = item.freq

    index = 0
    tosave_items = []
    for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0):
        index += 1
        if index == 10000:
            print(item)
            index = 0
            with db.atomic():
                WordPhoneTable.bulk_update(tosave_items,
                                           [WordPhoneTable.priority],
                                           batch_size=200)
            tosave_items.clear()

        word = item.word
        #if word in word_freq:
        #    freq = word_freq[word]
        #else:
        #    freq = 1

        freqs = [(chars_freq[word[e]] if word[e] in chars_freq else 10)
                 for e in range(len(word))]
        # print(freqs)
        priority = get_priority(freqs)
        item.priority = priority
        tosave_items.append(item)
コード例 #4
0
    return item


def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Store ``lu`` in ``item.lu`` and return the same (mutated) item."""
    setattr(item, "lu", lu)
    return item


if __name__ == "__main__":

    print("check full")
    to_update_full_items = pipe(WordPhoneTable.select().where(WordPhoneTable.full == ""),
        map(lambda e: fill_full(e)),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_full_items, fields=['full'], batch_size=100)
    del to_update_full_items

    print("check xhe")
    full_to_xhe_transformer = get_full_to_xhe_transformer()
    to_update_xhe_items = pipe(WordPhoneTable.select().where(WordPhoneTable.xhe == ""),
        map(lambda e: (e, word_to_two(e.word, full_to_xhe_transformer))),
        map(lambda e: fill_xhe(e[0], e[1])),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_xhe_items, fields=['xhe'], batch_size=100)
    del to_update_xhe_items
    del full_to_xhe_transformer

    print("check zrm")
    full_to_zrm_transformer = get_full_to_zrm_transformmer()