Beispiel #1
0
def _split_full_by_candidates(full: str, candidates_per_char: List[List[str]]):
    """Return every way to cut ``full`` into one candidate pinyin per character.

    Args:
        full: Pinyin string to segment.
        candidates_per_char: For each character, in word order, the pinyin
            strings that character may contribute.

    Returns:
        A list of ``(segments, remainder)`` tuples, keeping only the
        segmentations that consumed ``full`` completely (``remainder == ''``)
        and contain at least one segment.
    """
    # Start from a single "nothing consumed yet" state so the first character
    # is handled exactly like every later one.
    partial = [([], full)]
    for candidates in candidates_per_char:
        extended = []
        for segments, rest in partial:
            for candidate in candidates:
                if rest.startswith(candidate):
                    extended.append(
                        (segments + [candidate], rest[len(candidate):]))
        partial = extended
        if not partial:
            # No candidate fits this character: the word cannot be segmented.
            # Stop here instead of restarting from the head of ``full`` with
            # the next character.
            break
    return [e for e in partial if len(e[0]) > 0 and e[1] == '']


def update_word_full(char_phones: Dict[str, List[str]]):
    """Repair the ``full`` pinyin column of WordPhoneTable rows.

    A row is left alone when splitting ``full`` on single spaces already
    yields one piece per character of ``word``.  A row with an empty ``full``
    is filled from ``get_full(word)``.  Any other row has its ``full`` text
    re-segmented using the per-character candidates in ``char_phones``; the
    row is updated only when exactly one segmentation exists, otherwise a
    "wrong format" line is printed and the row is left unchanged.

    All changed rows are written back with one ``bulk_update`` of the ``full``
    field inside a ``db.atomic()`` block.

    Args:
        char_phones: Maps a character to the pinyin strings it may have.
    """
    to_update_items = []
    for item in WordPhoneTable.select():
        words: str = item.word
        full: str = item.full
        # This must be tested before the length comparison below:
        # ''.split(' ') is [''], so a one-character word with no pinyin would
        # otherwise look already correct and never be filled in.
        if full == '':
            item.full = ' '.join(get_full(words))
            to_update_items.append(item)
            continue
        if len(words) == len(full.split(' ')):
            continue

        words_candidate_fulls: List[List[str]] = []
        for char in words:
            if char not in char_phones:
                # FIXME: unknown characters are only reported, not fatal:
                # raise RuntimeError(f"{char} in phone table")
                print(f"{char} not in phone table")
                continue
            # Longest candidates first.
            words_candidate_fulls.append(
                sorted(char_phones[char], key=lambda e: -len(e)))

        full_arr = _split_full_by_candidates(full, words_candidate_fulls)
        if len(full_arr) != 1:
            # Zero or several segmentations: ambiguous, leave the row as is.
            # FIXME:
            # raise RuntimeError(f"get full pinyin fails, {item}")
            print(f"wrong format: {item}, {full_arr}")
        else:
            item.full = ' '.join(full_arr[0][0])
            to_update_items.append(item)

    if len(to_update_items) > 0:
        print(f"total have {len(to_update_items)} items to update")
        with db.atomic():
            WordPhoneTable.bulk_update(to_update_items,
                                       fields=['full'],
                                       batch_size=100)
    print("done")
Beispiel #2
0
def check_wordphonetable_pinyin(transformer: Dict[str, str], schema: ShuangPinSchema):
    """Recompute one shuangpin column of every WordPhoneTable row and fix mismatches.

    For each row, every space-separated piece of ``full`` is passed through
    ``split_sy`` and the two resulting parts are looked up in ``transformer``;
    the concatenation over all pieces is the expected code.  Rows whose stored
    code differs are corrected in memory and then written back with a single
    ``bulk_update`` inside ``db.atomic()``.

    Args:
        transformer: Lookup table applied to both parts returned by
            ``split_sy``.
        schema: One of ``XHE_SP_SCHEMA``, ``LU_SP_SCHEMA``, ``ZRM_SP_SCHEMA``
            or ``BINGJI_SP_SCHEMA``; selects the column ``xhe``, ``lu``,
            ``zrm`` or ``bingji`` respectively.

    Raises:
        RuntimeError: If ``schema`` is none of the four known schemas.
        KeyError: If a part returned by ``split_sy`` is missing from
            ``transformer``.
    """

    def schema_column() -> str:
        # Resolved with ``==`` in the fixed order xhe, lu, zrm, bingji.
        known_columns = (
            (XHE_SP_SCHEMA, 'xhe'),
            (LU_SP_SCHEMA, 'lu'),
            (ZRM_SP_SCHEMA, 'zrm'),
            (BINGJI_SP_SCHEMA, 'bingji'),
        )
        for known_schema, column_name in known_columns:
            if schema == known_schema:
                return column_name
        raise RuntimeError(f'unknown schema: {schema}')

    to_update_items = []
    for item in WordPhoneTable.select():
        fulls = item.full
        column = schema_column()
        stored_code = getattr(item, column)

        expected_code = ''.join([
            transformer[s] + transformer[y]
            for s, y in (split_sy(piece) for piece in fulls.split(' '))
        ])
        if expected_code == stored_code:
            continue

        setattr(item, column, expected_code)
        to_update_items.append(item)

    with db.atomic():
        WordPhoneTable.bulk_update(to_update_items,
                                   fields=[schema_column()],
                                   batch_size=100)

    print(to_update_items)
    print(f'update {len(to_update_items)} wordphonetable items')
Beispiel #3
0
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer) -> WordPhoneTable:
    """Build an unsaved WordPhoneTable row from the columns of one input line.

    Accepted shapes of ``cols``:

    * ``[word]`` - priority defaults to 1, pinyin comes from ``get_full``.
    * ``[word, priority]`` - pinyin comes from ``get_full``.
    * ``[word, priority, p1, ..., pn]`` with one pinyin column per character
      of ``word`` - the non-empty, stripped columns are used as the pinyin.

    Args:
        cols: Columns of one input line.
        xhe_transformer: Passed to ``full_to_two`` to build the ``xhe`` code.
        zrm_transformer: Passed to ``full_to_two`` to build the ``zrm`` code.

    Returns:
        A new WordPhoneTable instance; ``lu`` is left empty.

    Raises:
        RuntimeError: If ``cols`` has none of the accepted shapes.
    """
    n_cols = len(cols)
    if n_cols in (1, 2):
        word = cols[0]
        priority = 1 if n_cols == 1 else cols[1]
        full = get_full(word)
    elif n_cols == 2 + len(cols[0]):
        word, priority = cols[0], cols[1]
        # Keep only the pinyin columns that are non-empty after stripping.
        full = [piece for piece in (col.strip() for col in cols[2:])
                if len(piece) > 0]
    else:
        raise RuntimeError("word item should be: 你好 [priority n i h ao]")

    joined_full = ''.join(full)
    xhe_code = ''.join([full_to_two(piece, xhe_transformer) for piece in full])
    zrm_code = ''.join([full_to_two(piece, zrm_transformer) for piece in full])
    return WordPhoneTable(
        word=word,
        full=joined_full,
        xhe=xhe_code,
        zrm=zrm_code,
        lu="",
        priority=priority,
        updatedt=datetime.now(),
    )
Beispiel #4
0
def main():
    """Load words from the file named on the command line and bulk-insert them.

    Expects exactly one argument (the words file).  Otherwise the usage text
    is printed and the process exits with status 1.
    """
    argv = sys.argv
    if len(argv) != 2:
        # Only the first usage line goes to stderr; the rest go to stdout.
        print(f"使用方法: python3 {argv[0]} words.txt", file=sys.stderr)
        for help_line in (
                "文件行格式:word [w1_yin w2_yin ... prioroty]",
                "举例:你好 [ni hao 100]",
                "中括号内为可选内容",
        ):
            print(help_line)
        sys.exit(1)

    words_path = argv[1]

    add_words = load_words(words_path)
    print(add_words)
    # Insert everything in one transaction, 100 rows per statement.
    with db.atomic():
        WordPhoneTable.bulk_create(add_words, batch_size=100)

    print(f'done, add {len(add_words)} items')
Beispiel #5
0
def get_exists_words() -> Set[str]:
    """Collect every word that is already known.

    Returns:
        The union of the values returned by ``get_exists_chars()`` and the
        ``word`` column of every WordPhoneTable and TangshiTable row.
    """
    exist_words = set()

    # set.union() returns a new set and leaves the receiver untouched, so the
    # chars were silently dropped before; update() merges them in place.
    exist_words.update(get_exists_chars())
    exist_words.update(e.word for e in WordPhoneTable.select())
    exist_words.update(e.word for e in TangshiTable.select())

    return exist_words
Beispiel #6
0
def cols_to_word_phone_table(cols: List[str], xhe_transformer, zrm_transformer,
                             bingji_transformer,
                             lu_transformer) -> Union[WordPhoneTable, None]:
    """Build an unsaved WordPhoneTable row from the columns of one input line.

    Accepted shapes of ``cols`` (``n`` is the number of characters in the
    word):

    * ``[word]`` - pinyin comes from ``get_full``, priority is 100.
    * ``[word, p1, ..., pn]`` - explicit pinyin, priority is 100.
    * ``[word, p1, ..., pn, priority]`` - explicit pinyin and priority.

    Args:
        cols: Columns of one input line, e.g. ``['你好', 'ni', 'hao', '100']``.
        xhe_transformer: Passed to ``full_to_two`` for the ``xhe`` code.
        zrm_transformer: Passed to ``full_to_two`` for the ``zrm`` code.
        bingji_transformer: Passed to ``full_to_two`` (with ``bingji=True``)
            for the ``bingji`` code.
        lu_transformer: Passed to ``full_to_two`` for the ``lu`` code.

    Returns:
        The new row, or ``None`` when only the word was given and
        ``get_full`` raised (the error is printed).

    Raises:
        RuntimeError: If ``cols`` has none of the accepted shapes.
        ValueError: If the trailing priority column is not an integer.
    """
    if len(cols) == 1:
        word = cols[0]
        priority = 100
        try:
            full = get_full(word)
        except Exception as e:
            # Best effort: report the word that could not be converted and
            # let the caller skip it.
            print(e)
            return None
    # NOTE: a two-column "word priority" form is not handled here; such a
    # line only matches below when the word is a single character, in which
    # case the second column is read as its pinyin.
    elif len(cols) == 1 + len(cols[0]):
        word = cols[0]
        priority = 100
        full = list(filter(lambda e: len(e) > 0,
                           [e.strip() for e in cols[1:]]))
    elif len(cols) == 2 + len(cols[0]):
        word = cols[0]
        priority = int(cols[-1])
        # The last column is the priority, so the pinyin is cols[1:-1];
        # slicing to len(cols) would treat the priority as a syllable.
        full = list(
            filter(lambda e: len(e) > 0,
                   [e.strip() for e in cols[1:-1]]))
    else:
        raise RuntimeError("word item should be: 你好 [ni hao 100]")

    item = WordPhoneTable(
        word=word,
        full=' '.join(full),
        xhe=''.join([full_to_two(e, xhe_transformer) for e in full]),
        zrm=''.join([full_to_two(e, zrm_transformer) for e in full]),
        lu=''.join([full_to_two(e, lu_transformer) for e in full]),
        priority=priority,
        updatedt=datetime.now(),
        bingji=''.join(
            full_to_two(e, bingji_transformer, bingji=True) for e in full))
    print("add ", item)
    return item
def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Store ``lu`` in ``item.lu`` and hand the same (mutated) item back."""
    setattr(item, 'lu', lu)
    return item
        fout.write(f"---config@码表别名=系统单字\n")
        pipe(
            CharPhoneTable.select().order_by(CharPhoneTable.priority.desc()),
            filter(lambda e: e.char in char_to_shape),
            map(lambda e: f"{e.char}\t{e.zrm+char_to_shape[e.char]}#序40000"),
            for_each(lambda e: fout.write(e + '\n')),
        )

    del_words = pipe(DelWordTable.select(), map(lambda e: e.word), set)
    sys_word_data = f"{output_dir}/sys_word_data.txt"
    with open(sys_word_data, 'w', encoding='utf8') as fout:
        fout.write("---config@码表分类=主码-2\n")
        fout.write("---config@允许编辑=否\n")
        fout.write(f"---config@码表别名=系统词组\n")
        pipe(
            WordPhoneTable.select().order_by(fn.LENGTH(WordPhoneTable.word),
                                             WordPhoneTable.priority.desc()),
            filter(lambda e: e.word not in del_words),
            map(lambda e: (f'{e.word}\t{e.zrm}', e.word[0], e.word[-1])),
            filter(lambda e: e[1] in char_to_shape and e[2] in char_to_shape),
            map(lambda e:
                f'{e[0]}{char_to_shape[e[1]][0]}{char_to_shape[e[2]][-1]}#序20000'
                ), for_each(lambda e: fout.write(e + '\n')))

    with open(f'{output_dir}/sys_eng_data.txt', 'w', encoding='utf8') as fout:
        fout.write("---config@码表分类=主码-3\n")
        fout.write("---config@允许编辑=否\n")
        fout.write(f"---config@码表别名=系统英文\n")
        pipe(
            EngWordTable.select().where(EngWordTable.priority > 100).order_by(
                fn.LENGTH(EngWordTable.word), EngWordTable.priority),
            filter(lambda e: is_all_alpha(e.word)),
Beispiel #9
0

def mean(lst: List[int]) -> int:
    """Return the average of ``lst`` truncated to an int, or 1 when it is empty."""
    count = len(lst)
    return int(sum(lst) / count) if count else 1


# Script entry point: count how often each segmented word occurs in a
# sentence file, then walk WordPhoneTable.
if __name__ == "__main__":
    # Exactly one argument (the sentence file) is required.
    if len(sys.argv) != 2:
        print(f"Usage: python3 {sys.argv[0]} sents.txt", file=sys.stderr)
        sys.exit(1)
    _, sents_path = sys.argv

    # All words currently stored in WordPhoneTable are handed to the Segger.
    exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set)
    # NOTE(review): the meaning of the second argument (5) is not visible
    # here - presumably a maximum word length; confirm against Segger.
    seg = Segger(exist_words, 5)

    with open(sents_path, 'r', encoding='utf8') as fin:
        # Per line: strip, remove spaces and tabs, drop empty lines and lines
        # starting with '#', cut with the Segger, then count occurrences of
        # each resulting word.
        # NOTE(review): map/filter/concat/groupby/valmap are presumably the
        # curried toolz versions - their import is outside this view.
        word_freq = pipe(
            fin, map(lambda e: e.strip().replace(" ", "").replace("\t", "")),
            filter(lambda e: e != "" and not e.startswith("#")),
            map(lambda e: seg.cut(e)), concat, groupby(lambda e: e),
            valmap(lambda e: len(e)), dict)

    # Progress output: print every 1000th row while iterating the table.
    index = 0
    for item in WordPhoneTable.select():
        index += 1
        if index == 1000:
            print(item)
            index = 0
Beispiel #10
0
    #with open(sents_path, 'r', encoding='utf8') as fin:
    #    word_freq = pipe(
    #        fin, map(lambda e: e.strip().replace(" ", "").replace("\t", "")),
    #        filter(lambda e: e != "" and not e.startswith("#")),
    #        map(lambda e: seg.cut(e)), concat, groupby(lambda e: e),
    #        valmap(lambda e: len(e)), dict)

    chars_freq = {}
    for item in CharFreqTable.select():
        if item.char in chars_freq:
            raise ("duplicated " + item.char)
        chars_freq[item.char] = item.freq

    index = 0
    tosave_items = []
    for item in WordPhoneTable.select().where(WordPhoneTable.priority <= 0):
        index += 1
        if index == 10000:
            print(item)
            index = 0
            with db.atomic():
                WordPhoneTable.bulk_update(tosave_items,
                                           [WordPhoneTable.priority],
                                           batch_size=200)
            tosave_items.clear()

        word = item.word
        #if word in word_freq:
        #    freq = word_freq[word]
        #else:
        #    freq = 1
Beispiel #11
0
        [tuple(e.split("\t")) for e in generate_one_hit_char(60000).keys()])
    all_items.extend([
        tuple(e.split("\t"))
        for e in generate_topest_char(char_to_phones, 60000)
    ])

    #系统单字部分
    all_items.extend(
        pipe(CharPhoneTable.select(),
             filter(lambda e: e.char in char_to_shape),
             map(lambda e: (e.char, f"{e.xhe+char_to_shape[e.char]}")), list))

    del_words = pipe(DelWordTable.select(), map(lambda e: e.word), set)
    all_items.extend(
        pipe(
            WordPhoneTable.select(), filter(lambda e: e.word not in del_words),
            map(lambda e: (e.word, e.xhe, e.word[0], e.word[-1])),
            filter(lambda e: e[2] in char_to_shape and e[3] in char_to_shape),
            map(lambda e: (e[0], e[1] + char_to_shape[e[2]][0] + char_to_shape[
                e[3]][-1])), list))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_dir + "/xiaolu_word_for_baidu.ini", 'w',
              encoding='utf8') as fout:

        for key, value in groupby(lambda e: e[1],
                                  sorted(all_items, key=lambda e:
                                         (e[1]))).items():
            for i in range(len(value)):
                fout.write(f"{value[i][1]}={i+1},{value[i][0]}\n")
Beispiel #12
0
from toolz.curried import pipe, map
from tables import db, DelWordTable, WordPhoneTable

if __name__ == "__main__":
    # Every word listed in DelWordTable is removed from WordPhoneTable.
    words_to_remove = pipe(DelWordTable.select(), map(lambda row: row.word), set)

    # Report how many rows match before deleting them.
    match_count = (WordPhoneTable
                   .select()
                   .where(WordPhoneTable.word.in_(words_to_remove))
                   .count())
    print(f"total {match_count} items to delete")

    delete_query = WordPhoneTable.delete().where(
        WordPhoneTable.word.in_(words_to_remove))
    delete_query.execute()

    print("done")
Beispiel #13
0
                        word_phones.append((word, f"{c1}{c2}"))
            elif len(phones) == 3:
                for c1 in phones[0]:
                    for c2 in phones[1]:
                        for c3 in phones[2]:
                            word_phones.append((word, f"{c1}{c2}{c3}"))
            else:
                print(f"{word} {phones} lenght great than 3, exiting...")
                sys.exit(1)
            
        to_add_items = [] 
        exist_items = set()
        for (word, phones) in word_phones:
            if f"{word}{phones}" in exist_items:
                continue
            if len(phones) != len(word)*2:
                print(f"D: {word} {phones} wrong.")
                continue
            num = WordPhoneTable.select().where(WordPhoneTable.word == word, WordPhoneTable.phones == phones).count()
            if num > 0:
                continue
            to_add_items.append(WordPhoneTable(word=word, phones=phones, priority=1, updatedt=datetime.now()))
            exist_items.add(f"{word}{phones}")
            # WordPhoneTable(word=word, phones=phones, priority=1, updatedt=datetime.now()).save()
        print(f"add length {len(to_add_items)}")
        with db.atomic():
            WordPhoneTable.bulk_create(to_add_items, batch_size=100)
    print('done')
    pass

Beispiel #14
0
            '[1234567890’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,。!@#$%^&*………_+}{}]+',
            word) is None:
        return False
    else:
        return True


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr)
        sys.exit(1)

    _, words_path = sys.argv

    exist_words = set()
    exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set)

    exist_words = exist_words | pipe(DelWordTable.select(),
                                     map(lambda e: e.word), set)

    with open(words_path, "r", encoding='utf8') as fin:

        #FIXME: bug to fix, we have more phone type now.
        ft_dict = get_double_dict()

        to_add_words = pipe(
            fin,
            map(lambda e: e.strip().split('\t')),
            filter(lambda e: len(e) in (1, 2)),
            filter(lambda e: len(e[0]) <= 5),
            filter(lambda e: not contain_alpha(e[0]) and not contain_symbols(e[
Beispiel #15
0
    if re.match('[1234567890’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,。!@#$%^&*………_+}{}]+', word) is None:
        return False
    else:
        return True


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr)
        print("words format:word prioroty w1_yin w2_yin ...")
        sys.exit(1)

    _, words_path = sys.argv

    exist_words = set()
    exist_words = pipe(WordPhoneTable.select(),
                       map(lambda e: e.word),
                       set
                       )

    exist_words = exist_words | pipe(DelWordTable.select(),
                                     map(lambda e: e.word),
                                     set
                                     )

    xhe_transformer = get_full_to_xhe_transformer();
    zrm_transformer = get_full_to_zrm_transformmer();
    lu_transformer = get_full_to_lu_transformmer();

    with open(words_path, "r", encoding='utf8') as fin:
        to_add_words = pipe(fin,
Beispiel #16
0
            word) is None:
        return False
    else:
        return True


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"USAGE: python3 {sys.argv[0]} words.txt", file=sys.stderr)
        print("words format:word prioroty w1_yin w2_yin ...")
        sys.exit(1)

    _, words_path = sys.argv

    exist_words = set()
    exist_words = pipe(WordPhoneTable.select(), map(lambda e: e.word), set)

    exist_words = exist_words | pipe(DelWordTable.select(),
                                     map(lambda e: e.word), set)

    xhe_transformer = get_full_to_xhe_transformer()
    zrm_transformer = get_full_to_zrm_transformmer()
    lu_transformer = get_full_to_lu_transformmer()

    with open(words_path, "r", encoding='utf8') as fin:
        to_add_words = pipe(
            fin, map(lambda e: e.strip().split(' ')),
            filter(lambda e: len(e) in (1, 2)),
            filter(lambda e: len(e[0]) <= 5),
            filter(lambda e: not contain_alpha(e[0]) and not contain_symbols(e[
                0])), filter(lambda e: e[0] not in exist_words),
def fill_full(item: WordPhoneTable) -> WordPhoneTable:
    """Set ``item.full`` to the pieces of ``get_full(item.word)`` joined
    without a separator, and return the same (mutated) item."""
    pieces = get_full(item.word)
    item.full = ''.join(pieces)
    return item
def fill_zrm(item: WordPhoneTable, zrm: str) -> WordPhoneTable:
    """Store ``zrm`` in ``item.zrm`` and hand the same (mutated) item back."""
    setattr(item, 'zrm', zrm)
    return item
def fill_xhe(item: WordPhoneTable, xhe: str) -> WordPhoneTable:
    """Store ``xhe`` in ``item.xhe`` and hand the same (mutated) item back."""
    setattr(item, 'xhe', xhe)
    return item

def fill_zrm(item: WordPhoneTable, zrm: str) -> WordPhoneTable:
    """Assign ``zrm`` to the item's ``zrm`` attribute.

    Returns the item that was passed in, after mutation.
    """
    setattr(item, 'zrm', zrm)
    return item


def fill_lu(item: WordPhoneTable, lu: str) -> WordPhoneTable:
    """Assign ``lu`` to the item's ``lu`` attribute.

    Returns the item that was passed in, after mutation.
    """
    setattr(item, 'lu', lu)
    return item


# Script entry point: fill in WordPhoneTable columns that are still empty.
if __name__ == "__main__":

    # Step 1: rows whose ``full`` column is the empty string get it filled by
    # fill_full().
    print("check full")
    # NOTE(review): if ``map`` is toolz.curried.map (import not visible here)
    # this is a lazy iterator, so the query and fill_full() only run when
    # bulk_update consumes it - confirm.
    to_update_full_items = pipe(WordPhoneTable.select().where(WordPhoneTable.full == ""),
        map(lambda e: fill_full(e)),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_full_items, fields=['full'], batch_size=100)
    del to_update_full_items

    # Step 2: rows whose ``xhe`` column is the empty string get the code
    # computed by word_to_two() from the word and the xhe transformer.
    print("check xhe")
    full_to_xhe_transformer = get_full_to_xhe_transformer()
    to_update_xhe_items = pipe(WordPhoneTable.select().where(WordPhoneTable.xhe == ""),
        # Pair each row with its computed code, then store the code on it.
        map(lambda e: (e, word_to_two(e.word, full_to_xhe_transformer))),
        map(lambda e: fill_xhe(e[0], e[1])),
    )
    with db.atomic():
        WordPhoneTable.bulk_update(to_update_xhe_items, fields=['xhe'], batch_size=100)
    del to_update_xhe_items
Beispiel #21
0

def mean(lst: List[int]) -> int:
    """Integer-truncated average of ``lst``; an empty list yields 1."""
    if len(lst) != 0:
        total = sum(lst)
        return int(total / len(lst))
    return 1
    

# Script entry point: count how often each segmented word occurs in a
# sentence file.
if __name__ == "__main__":
    # Exactly one argument (the sentence file) is required.
    if len(sys.argv) != 2:
        print(f"Usage: python3 {sys.argv[0]} sents.txt", file=sys.stderr)
        sys.exit(1)
    _, sents_path = sys.argv

    # All words currently stored in WordPhoneTable are handed to the Segger.
    exist_words = pipe(WordPhoneTable.select(),
        map(lambda e: e.word),
        set
    )
    # NOTE(review): the meaning of the second argument (5) is not visible
    # here - presumably a maximum word length; confirm against Segger.
    seg = Segger(exist_words, 5)

    with open(sents_path, 'r', encoding='utf8') as fin:
        # NOTE(review): map/filter/concat/groupby/valmap are presumably the
        # curried toolz versions - their import is outside this view.
        word_freq = pipe(fin,
            # Strip each line and remove inner spaces and tabs.
            map(lambda e: e.strip().replace(" ", "").replace("\t", "")),
            # Skip empty lines and lines starting with '#'.
            filter(lambda e: e != "" and not e.startswith("#")),
            # Cut each sentence with the Segger and flatten the results.
            map(lambda e: seg.cut(e)),
            concat,
            # Group identical words and keep the size of each group.
            groupby(lambda e: e),
            valmap(lambda e: len(e)),
            dict
        )
Beispiel #22
0
def full_to_double(pinyin, full_to_two):
    """Translate each syllable of ``pinyin`` through the ``full_to_two`` table.

    Args:
        pinyin: Iterable of indexable items; only elements 0 and 1 of each
            item are used.
        full_to_two: Mapping applied to both elements.

    Returns:
        A list with one string per item: the mapped element 0 followed by the
        mapped element 1.

    Raises:
        KeyError: If either element is missing from ``full_to_two``.
    """
    doubles = []
    for syllable in pinyin:
        head = full_to_two[syllable[0]]
        tail = full_to_two[syllable[1]]
        doubles.append(head + tail)
    return doubles


def get_double_dict():
    """Load FullToTwoTable into a dict mapping each ``full`` value to its ``two``.

    A repeated ``full`` value is reported on stdout and the process exits
    with status 1.
    """
    mapping = {}
    for row in FullToTwoTable.select():
        key = row.full
        if key in mapping:
            print(f"ERROR in {key}")
            sys.exit(1)
        mapping[key] = row.two
    return mapping


if __name__ == "__main__":
    full_to_two = get_double_dict()

    # Re-derive the code of every stored word from lazy_pinyin and delete the
    # rows whose stored ``phones`` disagree with it.
    for item in WordPhoneTable.select():
        syllable_parts = [split_sy(syllable) for syllable in lazy_pinyin(item.word)]
        expected = ''.join(full_to_double(syllable_parts, full_to_two))
        if item.phones == expected:
            continue
        print(f"diff in {item.id}, {item.word}, {item.phones}, {expected}")
        item.delete_instance()

    print("done")