Example #1
 def tag(self):
     while True:
         tags = []
         intag = input('Enter the tag(s) to crawl; separate multiple tags with &&: ')
         if intag.lower() == 'q':
             sys.exit()
         if '&&' in intag:
             tags = intag.split('&&')
         else:
             tags.append(intag)
         empty = 0
         for i in range(len(tags)):
             print('<{}/{}> Parsing tag [{}]'.format(str(i + 1), str(len(tags)), tags[i]))
             load_phrases_dict(pydict)
             tags[i] = ''.join(lazy_pinyin(tags[i]))
             url = self.host + '/t/' + tags[i] + '/'
             html = httpget(url)
             if html is None:
                 empty += 1
                 print('Tag [{}] does not exist!'.format(tags[i]))
             else:
                 ehtml = etree.HTML(html)
                 page = ehtml.xpath("//div[@id='pages']/a[last()-1]/text()")[0]
                 result = ehtml.xpath("//ul[@class='img']/li")
                 if page != '1':
                     p = get_input(int(page), 'Enter the number of pages to parse [{} pages total]: '.format(page))
                     if p != 1:
                         pbar = tqdm.tqdm(range(2, p + 1), desc='Parsing pages', ncols=80)
                         for i1 in pbar:
                             result += self.get_results(url + str(i1) + '.html')
                     print('Tag [{}]: found {} galleries'.format(tags[i], len(result)))
                 else:
                     print('Tag [{}]: found {} galleries'.format(tags[i], len(result)))
                 self.results += result
         if empty == len(tags):
             print(colorama.Back.RED + 'None of the tags you entered exist, please try again!')
         else:
             print(colorama.Fore.GREEN + 'Found {} galleries across all tags'.format(len(self.results)))
             break
Example #2
def test_retrain():
    seg = mmseg.seg
    assert list(seg.cut('啊啊啊')) == ['啊', '啊', '啊']

    load_phrases_dict({'啊啊啊': [['a'], ['a'], ['a']]})
    mmseg.retrain(seg)
    assert list(seg.cut('啊啊啊')) == ['啊啊啊']
Example #3
    def __init__(self, pyfile=None, sil_mode=0):
        '''
        :param pyfile: path to the pinyin dictionary file; defaults to config.py_dict_path
        :param sil_mode: where the silence phone marker is placed:
                         0 = first position, 1 = last position,
                         -1 = CTC-style decoding, with a single silence marker at the end
        '''
        if pyfile is None:
            pyfile = config.py_dict_path
        self.py_file = pyfile
        # max_index already includes the blank added by the code: num_py_map[max_index] = <blank>
        self.max_index = None
        self.sil_mode = sil_mode

        # use_pinyin will be dropped later; the pinyin library is always used for phonetic annotation
        self.use_pinyin = True

        # Annotation errors that were found; hard-coded here as a temporary fix
        change_dict = {
            "茸": [["rong2"]],
            "蓉": [["rong2"]],
            "嗯": [["en1"]],
            "哦": [["ō"]],
            "排场": [["pai2"], ["chang3"]],
            "难进易退": [["nan2"], ["jin4"], ["yi4"], ["tui4"]],
            "哭丧": [["ku1"], ["sang4"]],
        }
        load_phrases_dict(change_dict)

        self.pinyin = lambda word: pinyin(word, Style.TONE3, errors="ignore")

        self.load()
Example #4
def test_custom_pinyin_dict2():
    hans = ['同行']
    try:
        assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'ha2ng']
    except AssertionError:
        pass
    load_phrases_dict({'同行': [['tóng'], ['xíng']]})
    assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']
Example #5
def register_pinyin():
    """
    register pypinyin for some special character
    """
    single_dict = {ord('哪'): 'na'}
    phrases_dict = {'哪些': [['na'], ['xie']]}
    load_single_dict(single_dict)
    load_phrases_dict(phrases_dict)
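A quick usage check of the snippet above, assuming a reasonably recent pypinyin that applies loaded phrases during conversion (the expected output is an assumption based on the registered readings):

from pypinyin import lazy_pinyin

register_pinyin()
print(lazy_pinyin('哪些'))  # expected: ['na', 'xie'] once the custom readings are loaded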
Example #6
def loadPhrasesDict():
    # qs-py.txt is the neutral-tone word list generated by qingsheng.py;
    # to add or revise neutral-tone pinyin, edit qs-py.txt directly
    with open('./pinyin-dict/qs-py.txt', encoding='utf-8') as f:
        for line in f.readlines():
            hzPyList = line.split(':')
            hz = hzPyList[0]  # e.g. 眼睛
            py = [[p] for p in hzPyList[1].strip().split(' ')]
            load_phrases_dict({hz: py})
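The qs-py.txt format is not shown above; a minimal sketch of the per-line layout this parser assumes (the sample line is hypothetical, inferred from the split-on-colon logic):

# Hypothetical qs-py.txt line: a phrase, a colon, then space-separated per-character pinyin
line = '眼睛:yǎn jing'
hz, py_str = line.split(':')
entry = {hz: [[p] for p in py_str.strip().split(' ')]}
# entry == {'眼睛': [['yǎn'], ['jing']]}, the shape load_phrases_dict expects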
Example #7
def test_custom_pinyin_dict2():
    hans = ['同行']
    try:
        assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'ha2ng']
    except AssertionError:
        pass
    load_phrases_dict({'同行': [['tóng'], ['xíng']]})
    assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']
Example #8
 def __init__(self):
     """ 加载pypinyin的本地短语库,并将短语加入jieba
     """
     phrase = json.load(open(config.phrase_file, 'r'), encoding='utf-8')
     load_phrases_dict(phrase, style=u'default')
     for key in phrase.keys():
         jieba.add_word(key)
     ### Load comment dict
     #        self.comment = json.load(open(config.comment_file, 'r'), encoding='utf-8')
     self.load_comment()
     for key in self.comment.keys():
         jieba.add_word(key)
Example #9
def test_phrases():
    seg = mmseg.seg
    assert list(seg.cut('你要重新考虑这条建议')) == \
           ['你', '要', '重新', '考', '虑', '这', '条', '建', '议']
    load_phrases_dict({'在一起': [['zài'], ['yì'], ['qǐ']]})
    assert list(seg.cut('在一片')) == ['在', '一片']

    # Prefix match, suffix is a phrase
    #
    # The head of the input matches the prefix of some other phrase,
    # and the tail of the input is itself a phrase;
    # that trailing phrase should be segmented out as one token
    assert list(seg.cut('行业')) == ['行业']
    assert list(seg.cut('金融行业')) == ['金', '融', '行业']
    # The whole input is a phrase
    assert list(seg.cut('金融寡头')) == ['金融寡头']
    assert list(seg.cut('服务行业')) == ['服务行业']
    assert list(seg.cut('人员')) == ['人员']
    assert list(seg.cut('服务人员')) == ['服务', '人员']

    assert list(seg.cut('银行')) == ['银行']
    assert list(seg.cut('浦发银行')) == ['浦', '发', '银行']

    assert list(seg.cut('')) == []
    # The whole input matches a prefix but is not itself a phrase
    assert list(seg.cut('金')) == ['金']
    assert list(seg.cut('金融')) == ['金', '融']
    #
    assert list(seg.cut('金融金')) == ['金', '融', '金']
    assert list(seg.cut('金融金融')) == ['金', '融', '金', '融']
    assert list(seg.cut('金融金融金融金融金融金融')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融'
    ]
    assert list(seg.cut('金融金融金融金融金融金融金')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金'
    ]

    # No match at all
    assert list(
        seg.cut('以其昏昏,使人昭昭')) == ['以', '其', '昏', '昏', ',', '使', '人', '昭', '昭']

    # Prefix has no match, suffix is a phrase
    assert list(seg.cut('以其昏昏行业')) == ['以', '其', '昏', '昏', '行业']

    # Prefix is a phrase
    assert list(seg.cut('行业以其昏昏')) == ['行业', '以', '其', '昏', '昏']

    # A phrase in the middle
    assert list(seg.cut('使人昭昭行业以其昏昏')) == [
        '使', '人', '昭', '昭', '行业', '以', '其', '昏', '昏'
    ]
Example #10
 def __init__(self):
     """ 加载pypinyin的本地短语库,并将短语加入jieba
     """
     phrase = json.load(open(config.data_path + '/shici_phrase.json', 'r'),
                        encoding='utf-8')
     load_phrases_dict(phrase, style=u'default')
     for key in phrase.keys():
         jieba.add_word(key)
     ### Load comment dict
     self.comment = json.load(open(config.data_path + '/shici_comment.json', 'r',
                                   encoding='utf-8'))
     for key in self.comment.keys():
         jieba.add_word(key)
Example #11
    def __init__(self):
        self.textReformer = text_reformer.TextReformer()

        self.pinyin_part = pinyin_utility.load_pinyin_part_dict()

        self.wordPart = word_part.WordPart()

        pypinyin.load_single_dict(pinyin_dict={0x55ef: u"en"})
        pypinyin.load_phrases_dict(
            phrases_dict={
                u'嗯哪': [[u'en'], [u'na']],
                u'人生何处不相逢': [[u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'],
                             [u'xiang'], [u'feng']]
            })
Example #12
    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']],
                                    '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            return pypinyin.lazy_pinyin(txt, 1, errors='ignore')

        self.text_to_vocab = text_to_vocab_func
Example #13
    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']],
                                    '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            pins=pypinyin.pinyin(txt)
            pins=[i[0] for i in pins]
            return pins

        self.text_to_vocab = text_to_vocab_func
Example #14
    def init_text_to_vocab(self):
        pypinyin.load_phrases_dict({'调大': [['tiáo'], ['dà']],
                                    '调小': [['tiáo'], ['xiǎo']],
                                    '调亮': [['tiáo'], ['liàng']],
                                    '调暗': [['tiáo'], ['àn']],
                                    '肖': [['xiāo']],
                                    '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
                                    '新传': [['xīn'], ['zhuàn']],
                                    '外传': [['wài'], ['zhuàn']],
                                    '正传': [['zhèng'], ['zhuàn']],
                                    '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']]
                                    })

        def text_to_vocab_func(txt):
            if self.for_multi_task:
                pys = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
                pys = [i[0] for i in pys]
                return pys
            else:
                pys=pypinyin.pinyin(txt)
                pys=[i[0] for i in pys]
                return pys

        self.text_to_vocab = text_to_vocab_func
Example #15
def _pre_pinyin_setting():
    '''Fix pypinyin annotation errors.'''
    load_phrases_dict({'嗯': [['ēn']]})
    load_phrases_dict({'风云变幻': [['fēng'], ['yún'], ['biàn'], ['huàn']]})
    load_phrases_dict({'不破不立': [['bù'], ['pò'], ['bù'], ['lì']]})
Example #16
def comp_freq3_pinyin(datatype, s=True, e=True):
    PY.load_phrases_dict({
        '那些': [['nà'], ['xiē']],
        '哪些': [['nǎ'], ['xiē']],
        '哪个': [['nǎ'], ['gè']]
    })

    # initialize the dictionary
    freq3 = {'E_E': {}} if e else {}
    for char in char_pinyin.keys():
        for pinyin in char_pinyin[char]:
            freq3[char + '_' + pinyin] = {}

    # load the language material
    with open("../../resources/" + datatype + "_list.json") as in_file:
        data_list = json.load(in_file)
        in_file.close()

    # count the conditional frequency
    for data_str in data_list:
        data_len = len(data_str)
        # mark pinyin
        data_pinyin_list = []  # the list of pinyin of data_str
        data_pinyin = PY.lazy_pinyin(data_str)
        for i in range(0, data_len):
            if data_str[i] not in char_pinyin:
                data_pinyin[i] = '-'
            elif data_pinyin[i] not in char_pinyin[data_str[i]]:
                data_pinyin[i] = random.choice(char_pinyin[data_str[i]])
            data_pinyin_list.append(data_str[i] + '_' + data_pinyin[i])

        # count frequency
        if data_len > 1:
            # starting empty character
            if s and data_pinyin_list[0][-1] != '-' and data_pinyin_list[1][
                    -1] != '-':
                prev_str_pinyin = ('S_S ' + data_pinyin_list[0])
                if prev_str_pinyin not in freq3[data_pinyin_list[1]]:
                    freq3[data_pinyin_list[1]][prev_str_pinyin] = 1
                else:
                    freq3[data_pinyin_list[1]][prev_str_pinyin] += 1

            # normal Chinese-pinyin pairs
            i = 2
            while i < data_len:
                if data_pinyin_list[i][-1] == '-':
                    i += 3
                elif data_pinyin_list[i - 1][-1] == '-':
                    i += 2
                elif data_pinyin_list[i - 2][-1] == '-':
                    i += 1
                else:
                    prev_str_pinyin = data_pinyin_list[
                        i - 2] + ' ' + data_pinyin_list[i - 1]
                    if prev_str_pinyin not in freq3[data_pinyin_list[i]]:
                        freq3[data_pinyin_list[i]][prev_str_pinyin] = 1
                    else:
                        freq3[data_pinyin_list[i]][prev_str_pinyin] += 1
                    i += 1

            # ending empty character
            if e and i == data_len:  # i == data_len guarantees the last 2 characters are available
                prev_str_pinyin = data_pinyin_list[
                    i - 2] + ' ' + data_pinyin_list[i - 1]
                if prev_str_pinyin not in freq3[('E_E')]:
                    freq3[('E_E')][prev_str_pinyin] = 1
                else:
                    freq3[('E_E')][prev_str_pinyin] += 1

    # count the frequency of each conditional string
    freq2 = {}
    for i in freq3:
        for j in freq3[i]:
            if j in freq2:
                freq2[j] += freq3[i][j]
            else:
                freq2[j] = freq3[i][j]

    # convert frequency to relative frequency
    for i in freq3:
        for j in freq3[i]:
            freq3[i][j] /= freq2[j]

    # save the result
    try:
        out_file = open("../../statistics/freq3_pinyin_sentence_S_E.json", 'w')
        json.dump(freq3, out_file)
        out_file.close()
    finally:
        return freq3
Example #17
# -*- coding: UTF-8 -*-

from pypinyin import lazy_pinyin, load_phrases_dict

load_phrases_dict({'覃': [['Qin']]})


def get_pinying(string: str):
    return ''.join([w.title() for w in lazy_pinyin(string)])
Example #18
def test_no_non_phrases():
    seg = mmseg.seg
    assert list(seg.cut('你要重新考虑这条建议')) == \
           ['你', '要', '重新', '考', '虑', '这', '条', '建', '议']
    load_phrases_dict({'在一起': [['zài'], ['yì'], ['qǐ']]})
    assert list(seg.cut('在一片')) == ['在', '一', '片']
Example #19
def _pre_pinyin_setting():
    '''Fix pypinyin annotation errors.'''
    load_phrases_dict({'嗯': [['ēn']]})
Example #20
def active():
    """
    Activate the custom phrase readings for polyphonic characters.
    """
    load_phrases_dict(PHRASES_DICTIONARY)
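PHRASES_DICTIONARY is defined elsewhere in that project; a hypothetical shape, following the {phrase: [[pinyin], ...]} format that load_phrases_dict expects:

# Hypothetical contents; the real constant lives in the project's own module.
PHRASES_DICTIONARY = {
    '重庆': [['chóng'], ['qìng']],
    '朝阳': [['cháo'], ['yáng']],
}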
Example #21
def pypinyin_fix():
    pypinyin.load_phrases_dict({"哪些": [["na"], ["xie"]]})
    pypinyin.load_phrases_dict({"哪个": [["na"], ["ge"]]})
    pypinyin.load_phrases_dict({"那些": [["na"], ["xie"]]})
    pypinyin.load_phrases_dict({"白干": [["bai"], ["gan"]]})
    pypinyin.load_phrases_dict({"寻思": [["xun"], ["si"]]})
    pypinyin.load_phrases_dict({"清寒": [["qing"], ["han"]]})
    pypinyin.load_phrases_dict({"补齐": [["bu"], ["qi"]]})
    pypinyin.load_phrases_dict({"添砖加瓦": [["tian"], ["zhuan"], ["jia"], ["wa"]]})
    pypinyin.load_phrases_dict({"敬业乐群": [["jing"], ["ye"], ["le"], ["qun"]]})
    pypinyin.load_phrases_dict({"物竞天择": [["wu"], ["jing"], ["tian"], ["ze"]]})
    pypinyin.load_phrases_dict({"心存疑虑": [["xin"], ["cun"], ["yi"], ["lv"]]})
    pypinyin.load_phrases_dict({"避免麻烦": [["bi"], ["mian"], ["ma"], ["fan"]]})
    pypinyin.load_phrases_dict({"叶落归根": [["ye"], ["luo"], ["gui"], ["gen"]]})
    pypinyin.load_phrases_dict({"地动山摇": [["di"], ["dong"], ["shan"], ["yao"]]})
    pypinyin.load_single_dict({ord("帧"): "zhen"})
    pypinyin.load_single_dict({ord("霰"): "xian"})
    pypinyin.load_single_dict({ord("珩"): "heng"})
    pypinyin.load_single_dict({ord("嗯"): "en"})
    pypinyin.load_single_dict({ord("嗲"): "dia"})
    pypinyin.load_single_dict({ord("豉"): "chi"})
    pypinyin.load_single_dict({ord("聒"): "guo"})
Example #22
def test_custom_pinyin_dict2_tone2():
    load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
    assert lazy_pinyin(['同行'], style=TONE2) == ['to4ng', 'ku1']
    assert pinyin(['同行']) == [['tòng'], ['kū']]
Example #23
                 "I": "ai ", "J": "jei ", "K": "kei ", "L": "el ", "M": "em ", "N": "en ",
                 "O": "eo ", "P": "pii ", "Q": "kiu ", "R": "aa ", "S": "es ", "T": "tii ", "U": "iu ", "V": "vii ",
                 "W": "dabliu ", "X": "eiks ", "Y": "wai ", "Z": "zii "}


# PUNCTUATION2 = r'“”()×"\'()*#'  # other symbols
# load_phrases_dict({u'360': [[u'jú'], [u'zǐ']]})

def json_load():
    with open("user_dict/fault-tolerant_word.json", "r") as rf:
        data = json.load(rf)
    return data


usr_phrase = json_load()
load_phrases_dict(usr_phrase)


def text2pinyin(syllables):
    temp = []
    for syllable in syllables:
        for p in PUNCTUATION:
            syllable = syllable.replace(p, "")
        # print(syllable)
        # if syllable.isdigit():
        try:
            syllable = atc.num2chinese(syllable)
            # print("sy:", syllable)
            new_sounds = lazy_pinyin(syllable, style=pypinyin.TONE2)
            print("pinyin:" + str(new_sounds))
            for e in new_sounds:
Example #24
    '呒',
    '𬉼',
    '帧',  # pypinyin bug
    '豉',  # pypinyin bug
    '𬣙',  # pypinyin bug
    '𬇕',  # pypinyin bug
    '𬣞',  # pypinyin bug
    '𬘓',  # pypinyin bug
    '𫭟',  # pypinyin bug
    '𫭢',  # pypinyin bug
    '𫵷',
    '𬇙',
    '𬣡',
]  # checked until 6629

load_phrases_dict(pypinyin_correction)


def vocal_encode_pinyin(initial, final):
    if len(initial) != 0 or len(final) > 2:
        for each, target in flypy_encoding_table_final.items():
            final = final.replace(each, target)

        for each, target in flypy_encoding_table_initial.items():
            initial = initial.replace(each, target)

    return initial + final


def vocal_encode(item):
    initials = pinyin(item, style=Style.INITIALS, strict=False, heteronym=True)
Example #25
from pypinyin import lazy_pinyin, load_phrases_dict

print(lazy_pinyin('朝阳'))
personalized_dict = {'朝阳': [['cháo'], ['yáng']]}
load_phrases_dict(personalized_dict)
print(lazy_pinyin('朝阳'))
Example #26
import re
from pypinyin import pinyin, load_phrases_dict
from add_new_word_dict import new_word_pinyin_dict, new_word_split_dict#, new_word_pianpang_dict
import os
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn releases
import random
import jieba
import jieba.posseg as pseg
jieba.load_userdict(['左 f', '口 n', '日 n','月 n', '年 n','一起 s','米 n'])
jieba.suggest_freq(('八', '下'), True)  # 八下
jieba.suggest_freq(('一个','包'), True)
load_phrases_dict(new_word_pinyin_dict)


dictionary_path = os.path.split(os.path.realpath(__file__))[0] + '\\dict\\'
word_component_dict, word_radical_dict, zi_radical_dict = joblib.load(dictionary_path+'chaizi_dict')

word_component_dict.update(new_word_split_dict)
# word_radical_dict.update(new_word_pianpang_dict)


def find_num(input):
    result = re.findall('[一二两俩三四五六]|[0-9]',input)
    return result


number_dict = {
    '一':1,
    '二':2,
    '两':2,
    '俩':2,
Example #27
#coding=utf-8
import pypinyin
import word_part
import numpy as np
import pinyin_utility
pypinyin.load_single_dict(pinyin_dict={0x55ef: u"en"})
pypinyin.load_phrases_dict(
    phrases_dict={
        u'嗯哪': [[u'en'], [u'na']],
        u'人生何处不相逢': [[u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'],
                     [u'xiang'], [u'feng']]
    })
pinyin_part_dict = pinyin_utility.load_pinyin_part_dict()
wp = word_part.WordPart()


def generate_vocab_vec():
    raw_vocab_list, raw_vocab, size1 = load_w2v('../../model/raw.txt')
    pinyin_vocab_list, pinyin_vocab, size2 = load_w2v('../../model/pinyin.txt')
    part_vocab_list, part_vocab, size3 = load_w2v('../../model/part.txt')
    fw1 = open('../../model/vocab.txt', 'w')
    fw2 = open('../../model/vec.txt', 'w')
    size = size1 + size2 + size3
    fw2.write(str(len(raw_vocab_list)) + ' ' + str(size) + '\n')
    fw1.write('UNK\n')
    unk_list = [0] * size
    unk = np.asarray(unk_list, dtype=np.float32).tolist()
    fw2.write(' '.join([str(i) for i in unk]) + '\n')
    for word in raw_vocab:
        word_pinyin = pypinyin.lazy_pinyin(word, errors=lambda x: u'ng')
        try:
Example #28
def load_luna_dict():
    single_dict = {}
    phrases_dict = {}
    # Luna Pinyin is a dictionary designed for traditional characters; the simplified characters in it are
    # treated as "inherited characters whose glyphs were borrowed by mainland simplified forms" and annotated with their "old" readings.
    # Using it directly on zhwiki text containing simplified characters gives dismal results, so opencc is used here to work around that.
    luna_dict = {}
    luna_dict_simple = {}
    with open('./rime-luna-pinyin/luna_pinyin.dict.yaml', mode='r') as f:
        for line in f:
            match = PATTERN_RIME_DICT_ITEM.match(line)
            if match:
                item = Dict(match.groupdict())
                # the words field on item is only used to trace where an item came from when debugging
                word = item['word']
                item.pop('word')
                item.words = word
                item.percent = float(item.percent) if item.percent is not None else 100

                if luna_dict.get(word) is None:
                    luna_dict[word] = [item]
                else:
                    # polyphonic word (multiple readings)
                    luna_dict[word].append(item)

                word_simple = _TO_SIMPLIFIED_CHINESE.convert(word)
                if word != word_simple:
                    item_simple = Dict(item)
                    if luna_dict_simple.get(word_simple) is None:
                        luna_dict_simple[word_simple] = [item_simple]
                    else:
                        # several traditional forms map to the same simplified form with the same pinyin; accumulate their frequencies
                        for exist_item in luna_dict_simple[word_simple]:
                            if exist_item.pinyin == item_simple.pinyin:
                                exist_item.percent += item_simple.percent
                                exist_item.words += item_simple.words
                                # logging.info(f'exist_item: {exist_item}')
                                break
                        else:
                            luna_dict_simple[word_simple].append(item_simple)
    # Overriding the traditional readings with the simplified ones means most of the "old" readings of those borrowed-glyph characters get replaced...
    luna_dict.update(luna_dict_simple)
    for (word, items) in luna_dict.items():
        for item in items:
            if item.percent < 5:
                # skip low-frequency entries
                continue
            if len(word) == 1:
                codePoint = ord(word)
                if single_dict.get(codePoint) is None:
                    single_dict[codePoint] = item.pinyin
                else:
                    single_dict[codePoint] = f'{single_dict[codePoint]},{item.pinyin}'
            else:
                w = item.pinyin.split(' ')
                if phrases_dict.get(word) is None:
                    phrases_dict[word] = [[it] for it in w]
                elif len(phrases_dict[word]) == len(w):
                    for i in range(len(w)):
                        phrases_dict[word][i].append(w[i])
                else:
                    logging.warning(f'invalid pinyin: {word} -> {item}')

    # Remove alternative readings from the built-in single-character dictionary
    for (word, pinyins) in PINYIN_DICT.items():
        pinyin_list = pinyins.split(',')
        if len(pinyin_list) > 1:
            PINYIN_DICT[word] = pinyin_list[0]
    # Remove alternative readings from the built-in phrase dictionary
    for (word, phrases) in PHRASES_DICT.items():
        for p in phrases:
            while len(p) > 1:
                p.pop()
    # Load the Luna dictionary
    load_single_dict(single_dict)
    load_phrases_dict(phrases_dict)
Example #29
def _pre_pinyin_setting():
    '''Fix pypinyin annotation errors.'''
    load_phrases_dict({'嗯': [['ēn']]})
    load_phrases_dict({'不破不立': [['bù'], ['pò'], ['bù'], ['lì']]})
Example #30
import re

from pypinyin import pinyin, lazy_pinyin, load_phrases_dict

if __name__ == '__main__':
    print(f'{pinyin("中心")}')
    print(f'{lazy_pinyin("樊昌学")}')
    print(f'{lazy_pinyin("音乐")}')
    print(f'{lazy_pinyin("啊奥哦额")}')
    load_phrases_dict({'蹒跚': [['pen'], ['shan']], '你好': [['hi'], ['ya']]})
    print(f'{lazy_pinyin("你好蹒跚好")}')

    listdate = re.split(r'[年月日号]', '2019年4月18')
    if listdate[-1] == '':
        listdate.pop()
    print('-'.join(listdate))

    date_pattern = re.compile(r'((\d+)年)?(\d+)月(\d+)([日号])?')

    def year_repl(matchobj):
        if matchobj.group(1):
            return f'{matchobj.group(2)}-{matchobj.group(3)}-{matchobj.group(4)}'
        else:
            from datetime import datetime
            return f'{datetime.today().year}-{matchobj.group(3)}-{matchobj.group(4)}'

    print(date_pattern.sub(year_repl, '4月18'))
    print(date_pattern.sub(year_repl, '4月18'))
    print(date_pattern.sub(year_repl, '2018年4月18号'))

    def process_carid(carid):
Example #31
def test_custom_pinyin_dict2_tone2():
    load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
    assert lazy_pinyin(['同行'], style=TONE2) == ['to4ng', 'ku1']
    assert pinyin('同行') == [['tòng'], ['kū']]
Using another word-segmentation module:

Install the segmentation module, e.g. pip install snownlp;

Pass a pre-segmented list of strings as the argument:

>>> from pypinyin import lazy_pinyin, TONE2
>>> from snownlp import SnowNLP
>>> hans = u'音乐123'
>>> hans_seg = SnowNLP(hans).words  # word segmentation
>>> hans_seg
[u'\u97f3\u4e50', u'123']
>>> lazy_pinyin(hans_seg, style=TONE2)
[u'yi1n', u'yue4', u'123']
Custom pinyin dictionaries
If you are not satisfied with the results, you can correct them by loading a custom pinyin dictionary via load_single_dict() or load_phrases_dict():

With the jieba segmentation module installed, for phrases that segmentation recognizes:

>>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2
>>> hans = u'桔子'
>>> lazy_pinyin(hans, style=TONE2)
[u'jie2', u'zi3']
>>> load_phrases_dict({u'桔子': [[u'jú'], [u'zǐ']]})
>>> lazy_pinyin(hans, style=TONE2)
[u'ju2', u'zi3']
Without the jieba segmentation module installed, and/or for phrases that segmentation does not recognize:

>>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2, load_single_dict
>>> hans = u'还没'
>>> lazy_pinyin(hans, style=TONE2)