예제 #1
0
def cols_to_tangshi_item(cols: List[str], xhe_transformer, zrm_transformer,
                         bingji_transformer, lu_transformer) -> TangshiTable:
    """Build a ``TangshiTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]`` -- priority defaults to 100, pinyin comes from ``get_full``.
    * ``[word, priority]`` -- pinyin comes from ``get_full``.
    * ``[word, priority, p1, ..., pn]`` -- explicit pinyin, one column per
      character of the word; blank columns are dropped.

    Args:
        cols: The columns of one input line, as described above.
        xhe_transformer: Passed to ``full_to_two`` to fill the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to fill the ``zrm`` field.
        bingji_transformer: Passed to ``full_to_two`` (with ``bingji=True``)
            to fill the ``bingji`` field.
        lu_transformer: Passed to ``full_to_two`` to fill the ``lu`` field.

    Returns:
        The new, unsaved ``TangshiTable`` instance.

    Raises:
        RuntimeError: If the column count matches none of the layouts, or the
            priority column is not an integer.
    """
    malformed = "word item should be: 你好 [priority ni hao]"

    n_cols = len(cols)
    if n_cols == 1:
        priority = 100
    elif n_cols == 2 or n_cols == 2 + len(cols[0]):
        # Always store an int so that the explicit priority has the same type
        # as the default one, instead of leaking the raw column string.
        try:
            priority = int(cols[1])
        except ValueError as err:
            raise RuntimeError(malformed) from err
    else:
        raise RuntimeError(malformed)

    word = cols[0]
    if n_cols <= 2:
        full = get_full(word)
    else:
        stripped = (col.strip() for col in cols[2:])
        full = [syllable for syllable in stripped if len(syllable) > 0]

    item = TangshiTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full))
    print("add ", item)
    return item
예제 #2
0
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer) -> WordPhoneTable:
    """Build a ``WordPhoneTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]`` -- priority defaults to 1, pinyin comes from ``get_full``.
    * ``[word, priority]`` -- pinyin comes from ``get_full``.
    * ``[word, priority, p1, ..., pn]`` -- explicit pinyin, one column per
      character of the word; blank columns are dropped.

    The ``full`` field is stored as the syllables concatenated WITHOUT a
    separator, and ``lu`` is left empty.

    Args:
        cols: The columns of one input line, as described above.
        xhe_transformer: Passed to ``full_to_two`` to fill the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to fill the ``zrm`` field.

    Returns:
        The new, unsaved ``WordPhoneTable`` instance.

    Raises:
        RuntimeError: If the column count matches none of the layouts, or the
            priority column is not an integer.
    """
    malformed = "word item should be: 你好 [priority n i h ao]"

    n_cols = len(cols)
    if n_cols == 1:
        priority = 1
    elif n_cols == 2 or n_cols == 2 + len(cols[0]):
        # Always store an int so that the explicit priority has the same type
        # as the default one, instead of leaking the raw column string.
        try:
            priority = int(cols[1])
        except ValueError as err:
            raise RuntimeError(malformed) from err
    else:
        raise RuntimeError(malformed)

    word = cols[0]
    if n_cols <= 2:
        full = get_full(word)
    else:
        stripped = (col.strip() for col in cols[2:])
        full = [syllable for syllable in stripped if len(syllable) > 0]

    return WordPhoneTable(
        word=word,
        full=''.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu="",
        priority=priority,
        updatedt=datetime.now()
    )
예제 #3
0
def update_word_full(char_phones: Dict[str, List[str]]):
    """Rewrite ``WordPhoneTable.full`` as space-separated per-character pinyin.

    For every row of ``WordPhoneTable``:

    * an empty ``full`` is filled from ``get_full(word)``;
    * a ``full`` that already has one space-separated part per character is
      left untouched;
    * otherwise ``full`` is treated as syllables glued together without a
      separator and is split by trying, character by character, every
      candidate pinyin listed in ``char_phones``. The row is updated only
      when exactly one split consumes the whole string; any other outcome is
      printed and the row is left as it is.

    All changed rows are written back with a single ``bulk_update`` inside one
    ``db.atomic()`` block.

    Args:
        char_phones: Maps a single character to its candidate full pinyins.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full

        # This check has to come before the length comparison below:
        # ''.split(' ') is [''], whose length is 1, so a one-character word
        # with an empty pinyin would otherwise look "already split" and never
        # be filled.
        if full == '':
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        words_candidate_fulls: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                print(f"{char} not in phone table")
                continue
                # FIXME:
                # raise RuntimeError(f"{char} in phone table")
            # Longest candidates first.
            words_candidate_fulls.append(
                sorted(char_phones[char], key=lambda e: -len(e)))

        # Each partial result is (syllables matched so far, unmatched rest).
        # The search is seeded once with the whole string; if some character
        # matches nothing the list becomes empty and stays empty, so a later
        # character can never restart matching from the beginning of `full`.
        partials: List[Tuple[List[str], str]] = [([], full)]
        for word_candidate_fulls in words_candidate_fulls:
            extended: List[Tuple[List[str], str]] = []
            for segments, rest in partials:
                for candidate_full in word_candidate_fulls:
                    if rest.startswith(candidate_full):
                        extended.append((segments + [candidate_full],
                                         rest[len(candidate_full):]))
            partials = extended

        # Keep only the splits that consumed the whole string.
        full_arr = [e for e in partials if len(e[0]) > 0 and e[1] == '']
        if len(full_arr) != 1:
            print(f"wrong format: {item}, {full_arr}")
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if len(to_update_items) > 0:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
예제 #4
0
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer,
                             bingji_transformer,
                             lu_transformer) -> Union[WordPhoneTable, None]:
    """Build a ``WordPhoneTable`` row from the columns of one input line.

    Accepted layouts, where ``n`` is ``len(cols[0])``:

    * ``[word]`` -- priority 100, pinyin comes from ``get_full``; if
      ``get_full`` raises, the error is printed and ``None`` is returned.
    * ``[word, p1, ..., pn]`` -- explicit pinyin, priority 100.
    * ``[word, p1, ..., pn, priority]`` -- explicit pinyin and priority.

    Blank pinyin columns are dropped.

    Args:
        cols: The columns of one input line, as described above.
        xhe_transformer: Passed to ``full_to_two`` to fill the ``xhe`` field.
        zrm_transformer: Passed to ``full_to_two`` to fill the ``zrm`` field.
        bingji_transformer: Passed to ``full_to_two`` (with ``bingji=True``)
            to fill the ``bingji`` field.
        lu_transformer: Passed to ``full_to_two`` to fill the ``lu`` field.

    Returns:
        The new, unsaved ``WordPhoneTable`` instance, or ``None`` when the
        pinyin of a bare word could not be looked up.

    Raises:
        RuntimeError: If the column count matches none of the layouts.
        ValueError: If the trailing priority column is not an integer.
    """
    if len(cols) == 1:
        word = cols[0]
        priority = 100
        try:
            full = get_full(word)
        except Exception as e:
            # Deliberate best effort: report the word and let the caller
            # skip it instead of aborting the whole import.
            print(e)
            return None
    elif len(cols) == 1 + len(cols[0]):
        word = cols[0]
        priority = 100
        pinyin_cols = cols[1:]
        full = [e.strip() for e in pinyin_cols if len(e.strip()) > 0]
    elif len(cols) == 2 + len(cols[0]):
        word = cols[0]
        priority = int(cols[-1])
        # The last column is the priority, so it must be left out of the
        # pinyin columns; otherwise it is stored as an extra syllable.
        pinyin_cols = cols[1:-1]
        full = [e.strip() for e in pinyin_cols if len(e.strip()) > 0]
    else:
        raise RuntimeError("word item should be: 你好 [ni hao 100]")

    item = WordPhoneTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full))
    print("add ", item)
    return item
예제 #5
0
def fill_full(item: WordPhoneTable) -> WordPhoneTable:
    """Set ``item.full`` from ``get_full(item.word)`` and return the item.

    The pieces returned by ``get_full`` are concatenated without a separator.
    The item is modified in place; the same object is returned.
    """
    pieces = get_full(item.word)
    item.full = ''.join(pieces)
    return item
예제 #6
0
            for_each(lambda e: print(e)),
        )
        print(f"null phones item is: {len(null_phones_items)}")
        sys.exit(1)

    del null_phones_items

    null_full_items = pipe(
        CharPhoneTable.select().where(CharPhoneTable.full == ''),
        list,
    )
    if len(null_full_items) != 0:
        print(f"null full items is {len(null_full_items)}")
        pipe(
            null_full_items,
            map(lambda e: (e, ''.join(get_full(e.char)))),
            map(lambda e: update_full(e[0], e[1])),
            for_each(lambda e: e.save()),
        )
    del null_full_items

    full_to_xhe_transformer = get_full_to_xhe_transformer()
    xhe_full_neq_items = pipe(
        CharPhoneTable.select(),
        filter(lambda e: e.phones != full_to_two(e.full,
                                                 full_to_xhe_transformer)),
        list,
    )
    if len(xhe_full_neq_items) != 0:
        print(f"xhe full not equal len is {len(xhe_full_neq_items)}")
예제 #7
0
def load_chars(filepath: str):
    """Import characters and their shape codes from a text file.

    Each non-empty line is split on single spaces and must look like
    ``char shape [pinyin [priority]]``. A line is reported as broken and
    skipped when it has the wrong number of columns, ``char`` is not exactly
    one character or is rejected by ``common.contain_alpha``, ``shape`` or
    ``pinyin`` is not purely alphabetic, or ``priority`` is not an integer.
    A missing pinyin is looked up with ``common.get_full``; a priority below
    1 is raised to 1.

    A ``char + pinyin`` combination already present in the collection returned
    by ``common.get_exists_charyinpins()`` (or seen earlier in this file) is
    skipped. Accepted lines produce one ``CharPhoneTable`` row and one
    ``CharHeShapeTable`` row, which are bulk-inserted in two separate
    transactions.

    Args:
        filepath: Path of the UTF-8 text file to read.
    """
    # NOTE(review): presumably a set of "char + pinyin" strings -- it is only
    # used with `in` and `.add()` here; confirm against `common`.
    exist_charpinyins = common.get_exists_charyinpins()

    chars = []
    shapes = []
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if len(line) == 0:
                continue
            cols = line.split(" ")
            if len(cols) < 2:
                print(f"{line} broken")
                continue
            if len(cols) > 4:
                print(f"broken line {line}")
                continue

            char = cols[0]
            shape = cols[1]
            pinyin = cols[2] if len(cols) >= 3 else None
            priority = 1
            if len(cols) == 4:
                # A non-numeric priority is just another malformed line; it
                # must not abort the whole import with a ValueError.
                try:
                    priority = int(cols[3])
                except ValueError:
                    print(f"broken line {line}")
                    continue

            if common.contain_alpha(word=char):
                print(f"broken line {line}")
                continue
            if len(char) != 1:
                print(f"broken line {line}")
                continue
            if not shape.isalpha():
                print(f"broken line {line}")
                continue
            if pinyin is not None and not pinyin.isalpha():
                print(f"broken line {line}")
                continue
            if pinyin is None:
                pinyin = ''.join(common.get_full(char))
            if priority < 1:
                priority = 1

            if char + pinyin in exist_charpinyins:
                print(f"already exists {line}")
                continue
            # Remember the pair so a repeat later in the same file is skipped.
            exist_charpinyins.add(char + pinyin)
            chars.append(tables.CharPhoneTable(
                char=char,
                full=pinyin,
                xhe='',
                lu='',
                zrm='',
                bingji='',
                priority=priority,
                updatedt=datetime.now(),
            ))
            shapes.append(tables.CharHeShapeTable(
                char=char,
                shapes=shape,
                priority=priority,
                updatedt=datetime.now(),
            ))

    with tables.db.atomic():
        tables.CharHeShapeTable.bulk_create(shapes, batch_size=100)
    print(f"add he shape: {shapes}")
    print(f"add he shape num: {len(shapes)}")

    with tables.db.atomic():
        tables.CharPhoneTable.bulk_create(chars, batch_size=100)
    print(f"add char phone: {chars}")
    print(f"add char phone num: {len(chars)}")