コード例 #1
0
def _segment_full(full: str, candidates_per_char: List[List[str]]
                  ) -> List[Tuple[List[str], str]]:
    """Return every split of *full* that uses one candidate per character.

    Each result is ``(segments, '')``: the chosen candidates in order plus
    the (empty) unconsumed remainder of *full*.
    """
    # Seed with "nothing chosen, everything left" so the first character is
    # handled like every other one, and a dead end stays a dead end instead
    # of restarting the match from the beginning of ``full``.
    partials: List[Tuple[List[str], str]] = [([], full)]
    for candidates in candidates_per_char:
        partials = [
            (segments + [candidate], rest[len(candidate):])
            for segments, rest in partials
            for candidate in candidates
            if rest.startswith(candidate)
        ]
    return [(segments, rest) for segments, rest in partials
            if segments and rest == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Rewrite ``WordPhoneTable.full`` as one space-separated part per character.

    Rows whose ``full`` already has as many space-separated parts as the word
    has characters are left alone. An empty ``full`` is filled from
    ``get_full(word)``. Any other value is treated as unsegmented text and is
    split by matching, character by character, the candidates listed in
    *char_phones*; the row is updated only when exactly one split consumes
    the whole string. All changed rows are written back in one transaction.

    Args:
        char_phones: Maps a character to the candidate strings that may
            represent it inside ``full``.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full
        # Test for an empty value first: ''.split(' ') is [''], so a
        # one-character word with an empty ``full`` would otherwise look
        # already segmented and never be filled in.
        if full == '':
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        words_candidate_fulls: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                # Best effort: report the character and leave it out of the
                # match.
                # FIXME:
                # raise RuntimeError(f"{char} in phone table")
                print(f"{char} not in phone table")
                continue
            # Longest candidates first; the sort is stable, so candidates of
            # equal length keep their original order.
            words_candidate_fulls.append(
                sorted(char_phones[char], key=len, reverse=True))

        full_arr = _segment_full(full, words_candidate_fulls)
        if len(full_arr) != 1:
            # Either no split or an ambiguous one: report and skip the row.
            print(f"wrong format: {item}, {full_arr}")
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if to_update_items:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
コード例 #2
0
def check_tangshitable_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of ``TangshiTable`` from ``full``.

    For every row, each space-separated item of ``full`` is split into two
    parts by ``split_sy``; both parts are looked up in *transformer* and the
    results are concatenated. When the outcome differs from the value stored
    in the column belonging to *schema*, the row is corrected. All corrected
    rows are written back in batches inside one transaction, then printed.

    Args:
        transformer: Lookup applied to each part returned by ``split_sy``.
        schema: One of ``XHE_SP_SCHEMA``, ``LU_SP_SCHEMA``, ``ZRM_SP_SCHEMA``
            or ``BINGJI_SP_SCHEMA``; selects the column to verify.

    Raises:
        RuntimeError: If *schema* is none of the four supported schemas.
    """
    # Resolve the target column once, before touching the database, instead
    # of repeating the same if/elif chain for read, write and bulk update.
    # Compared with ``==`` in the original order; no hashing is assumed.
    schema_columns = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    column = next(
        (name for known, name in schema_columns if schema == known), None)
    if column is None:
        raise RuntimeError(f'unknown schema: {schema}')

    to_update_items = []
    for item in TangshiTable.select():
        expected = ''.join(
            transformer[s] + transformer[y]
            for s, y in map(split_sy, item.full.split(' ')))
        if expected != getattr(item, column):
            setattr(item, column, expected)
            to_update_items.append(item)

    with db.atomic():
        TangshiTable.bulk_update(to_update_items,
                                 fields=[column],
                                 batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} tangshitable items')
コード例 #3
0
def check_chars_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of ``CharPhoneTable`` from ``full``.

    For every row, ``full`` is split into two parts by ``split_sy``; both
    parts are looked up in *transformer* and concatenated. When the result
    differs from the value stored in the column belonging to *schema*, the
    row is corrected. All corrected rows are written back in batches inside
    one transaction, then printed.

    Args:
        transformer: Lookup applied to each part returned by ``split_sy``.
        schema: One of ``XHE_SP_SCHEMA``, ``LU_SP_SCHEMA``, ``ZRM_SP_SCHEMA``
            or ``BINGJI_SP_SCHEMA``; selects the column to verify.

    Raises:
        RuntimeError: If *schema* is none of the four supported schemas.
    """
    # Resolve the target column once, before touching the database, instead
    # of repeating the same if/elif chain for read, write and bulk update.
    # Compared with ``==`` in the original order; no hashing is assumed.
    schema_columns = (
        (XHE_SP_SCHEMA, 'xhe'),
        (LU_SP_SCHEMA, 'lu'),
        (ZRM_SP_SCHEMA, 'zrm'),
        (BINGJI_SP_SCHEMA, 'bingji'),
    )
    column = next(
        (name for known, name in schema_columns if schema == known), None)
    if column is None:
        raise RuntimeError(f"unkonwn schame {schema}")

    to_update_items = []
    for item in CharPhoneTable.select():
        s, y = split_sy(item.full)
        sp = transformer[s] + transformer[y]
        if getattr(item, column) != sp:
            setattr(item, column, sp)
            to_update_items.append(item)

    with db.atomic():
        CharPhoneTable.bulk_update(to_update_items,
                                   fields=[column],
                                   batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} char items')
コード例 #4
0
def main():
    """Load words from the file named on the command line into ``TangshiTable``.

    Expects exactly one argument, the path of the words file. With any other
    argument count the usage text is written to stderr and the process exits
    with status 1. Otherwise the file is parsed by ``load_words`` and the
    resulting items are inserted in batches inside one transaction.
    """
    if len(sys.argv) != 2:
        # The whole usage text goes to stderr so that stdout stays clean
        # when the script is invoked incorrectly.
        usage_lines = (
            f"使用方法: python3 {sys.argv[0]} words.txt",
            "文件行格式:word [prioroty w1_yin w2_yin ...]",
            "举例:你好 [10 ni hao]",
            "中括号内为可选内容",
        )
        for line in usage_lines:
            print(line, file=sys.stderr)
        sys.exit(1)

    _, words_path = sys.argv

    add_words = load_words(words_path)
    print(add_words)
    with db.atomic():
        TangshiTable.bulk_create(add_words, batch_size=100)

    print(f'done, add {len(add_words)} items')
コード例 #5
0
    #        valmap(lambda e: len(e)), dict)

    chars_freq = {}
    for item in CharFreqTable.select():
        if item.char in chars_freq:
            raise ("duplicated " + item.char)
        chars_freq[item.char] = item.freq

    index = 0
    tosave_items = []
    for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0):
        index += 1
        if index == 10000:
            print(item)
            index = 0
            with db.atomic():
                WordPhoneTable.bulk_update(tosave_items,
                                           [WordPhoneTable.priority],
                                           batch_size=200)
            tosave_items.clear()

        word = item.word
        #if word in word_freq:
        #    freq = word_freq[word]
        #else:
        #    freq = 1

        freqs = [(chars_freq[word[e]] if word[e] in chars_freq else 10)
                 for e in range(len(word))]
        # print(freqs)
        priority = get_priority(freqs)