def tag(self):
    # Interactively prompt for gallery tags, resolve each tag's pages on the
    # site, and accumulate all matched galleries into self.results.
    # NOTE(review): relies on names defined elsewhere in the file/module:
    # httpget, get_input, pydict, sys, etree, tqdm, colorama — confirm scope.
    while True:
        tags=[]
        # Prompt (Chinese): enter tags to crawl; separate multiple tags with '&&'.
        intag=input('请输入要爬取的标签,多个标签请用&&分割: ')
        if intag.lower()=='q':sys.exit()  # 'q' quits the whole program
        if ('&&' in intag)==True:tags=intag.split('&&')
        else:tags.append(intag)
        empty=0  # counts tags that resolved to no page at all
        for i in range(len(tags)):
            print('<{}/{}>正在解析标签[{}]'.format(str(i+1),str(len(tags)),tags[i]))
            # The site uses pinyin URLs: register custom readings (pydict,
            # presumably module-level) then romanize the tag in place.
            load_phrases_dict(pydict)
            tags[i]=''.join(lazy_pinyin(tags[i]))
            url=self.host+'/t/'+tags[i]+'/'
            html=httpget(url)
            if html ==None:
                empty+=1
                print('标签[{}]不存在!'.format(tags[i]))  # tag does not exist
            else:
                ehtml=etree.HTML(html)
                # The second-to-last pager link carries the total page count.
                page=ehtml.xpath("//div[@id='pages']/a[last()-1]/text()")[0]
                result=ehtml.xpath("//ul[@class='img']/li")
                if page!='1':
                    # Ask how many pages to parse (get_input presumably caps at total).
                    p=get_input(int(page),'请输入要解析的页数[总页数{}]: '.format(page))
                    if p!=1:
                        # Pages 2..p; page 1 was already collected above.
                        pbar=tqdm.tqdm(range(2,p+1),desc='解析进度',ncols=80)
                        for i1 in pbar:
                            result+=self.get_results(url+str(i1)+'.html')
                    print('标签[{}]下共找到图集{}个'.format(tags[i],len(result)))
                else:
                    print('标签[{}]下共找到图集{}个'.format(tags[i],len(result)))
                self.results+=result
        # Retry the prompt only when every entered tag was missing.
        if empty==len(tags):print(colorama.Back.RED+'您输入的标签均不存在,请重试!')
        else:
            print(colorama.Fore.GREEN+'所有标签下共找到图集{}个'.format(len(self.results)))
            break
def test_retrain():
    """Retraining the segmenter must pick up phrases loaded afterwards."""
    segmenter = mmseg.seg
    # Before the phrase is registered, the text is cut character by character.
    assert list(segmenter.cut('啊啊啊')) == ['啊', '啊', '啊']
    load_phrases_dict({'啊啊啊': [['a'], ['a'], ['a']]})
    mmseg.retrain(segmenter)
    # After retraining, the whole phrase comes back as a single token.
    assert list(segmenter.cut('啊啊啊')) == ['啊啊啊']
def __init__(self, pyfile=None, sil_mode=0):
    '''
    :param pyfile: path of the pinyin dictionary file; defaults to
        ``config.py_dict_path`` when ``None``.
    :param sil_mode: chooses how the silence phoneme is labelled:
        0 = first position, 1 = last position, -1 = CTC-decoding layout
        (one at the end).
    '''
    if pyfile is None:
        pyfile = config.py_dict_path
    self.py_file = pyfile
    # max_index includes the <blank> token appended by the code:
    # num_py_map[max_index] = <blank>; filled in later (by self.load(), presumably).
    self.max_index = None
    self.sil_mode = sil_mode
    # use_pinyin is deprecated; the pypinyin library is now always used.
    self.use_pinyin = True
    # Hard-coded workarounds for readings pypinyin gets wrong.
    # NOTE(review): most entries use TONE3 strings ("rong2") but '哦' uses a
    # tone-marked "ō" — confirm this mix is intended for the default load style.
    change_dict = {
        "茸": [["rong2"]],
        "蓉": [["rong2"]],
        "嗯": [["en1"]],
        "哦": [["ō"]],
        "排场": [["rong2"], ["chang3"]],
        "难进易退": [["nan2"], ["jin4"], ["yi4"], ["tui4"]],
        "哭丧": [["ku1"], ["sang4"]],
    }
    load_phrases_dict(change_dict, )
    # Always annotate with TONE3 (digit suffix) and drop unknown characters.
    self.pinyin = lambda word: pinyin(word, Style.TONE3, errors="ignore")
    self.load()
def test_custom_pinyin_dict2():
    """A custom phrase dict should change 同行 from ha2ng to xi2ng."""
    words = ['同行']
    try:
        # The built-in data may already differ here; tolerate either outcome.
        assert lazy_pinyin(words, style=TONE2) == ['to2ng', 'ha2ng']
    except AssertionError:
        pass
    load_phrases_dict({'同行': [['tóng'], ['xíng']]})
    assert lazy_pinyin(words, style=TONE2) == ['to2ng', 'xi2ng']
def register_pinyin():
    """Register pypinyin overrides for some special characters."""
    # Single-character override: 哪 -> na.
    load_single_dict({ord('哪'): 'na'})
    # Phrase-level override: 哪些 -> na xie.
    load_phrases_dict({'哪些': [['na'], ['xie']]})
def loadPhrasesDict():
    """Load the neutral-tone phrase list into pypinyin.

    qs-py.txt is generated by qingsheng.py; to add or amend neutral-tone
    readings, edit qs-py.txt directly.  Each line has the form
    ``<phrase>:<syllable> <syllable> ...`` (e.g. ``眼睛:yan jing``).
    """
    with open('./pinyin-dict/qs-py.txt') as f:
        # Iterate the file lazily instead of materializing f.readlines().
        for line in f:
            hz, sep, py_str = line.partition(':')
            if not sep:
                # Blank or malformed line (no ':'): skip instead of raising
                # IndexError as the previous split(':')[1] version did.
                continue
            py = [[p] for p in py_str.strip().split(' ')]
            load_phrases_dict({hz: py})
def __init__(self):
    """Load pypinyin's local phrase dictionary and register every phrase
    (and every comment-dict key) with jieba so they segment as whole words.
    """
    # json.load() no longer accepts an `encoding` keyword on Python 3
    # (removed in 3.9); decode at the file layer instead, and use a
    # context manager so the handle is closed deterministically.
    with open(config.phrase_file, 'r', encoding='utf-8') as f:
        phrase = json.load(f)
    load_phrases_dict(phrase, style=u'default')
    for key in phrase:
        jieba.add_word(key)
    # Load comment dict via the dedicated helper.
    self.load_comment()
    for key in self.comment:
        jieba.add_word(key)
def test_phrases():
    """Exercise phrase segmentation against the built-in word list."""
    tokenizer = mmseg.seg
    assert list(tokenizer.cut('你要重新考虑这条建议')) == \
        ['你', '要', '重新', '考', '虑', '这', '条', '建', '议']
    load_phrases_dict({'在一起': [['zài'], ['yì'], ['qǐ']]})
    assert list(tokenizer.cut('在一片')) == ['在', '一片']

    # Input head matches another word's prefix while the tail is a full word:
    # the tail word must still be segmented out on its own.
    assert list(tokenizer.cut('行业')) == ['行业']
    assert list(tokenizer.cut('金融行业')) == ['金', '融', '行业']

    # The whole input is itself a known word.
    assert list(tokenizer.cut('金融寡头')) == ['金融寡头']
    assert list(tokenizer.cut('服务行业')) == ['服务行业']
    assert list(tokenizer.cut('人员')) == ['人员']
    assert list(tokenizer.cut('服务人员')) == ['服务', '人员']
    assert list(tokenizer.cut('银行')) == ['银行']
    assert list(tokenizer.cut('浦发银行')) == ['浦', '发', '银行']
    assert list(tokenizer.cut('')) == []

    # The whole input matches a word prefix but is not itself a word.
    assert list(tokenizer.cut('金')) == ['金']
    assert list(tokenizer.cut('金融')) == ['金', '融']
    # assert list(tokenizer.cut('金融金')) == ['金', '融', '金']
    assert list(tokenizer.cut('金融金融')) == ['金', '融', '金', '融']
    assert list(tokenizer.cut('金融金融金融金融金融金融')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融'
    ]
    assert list(tokenizer.cut('金融金融金融金融金融金融金')) == [
        '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金', '融', '金'
    ]

    # No dictionary match anywhere in the input.
    assert list(
        tokenizer.cut('以其昏昏,使人昭昭')) == ['以', '其', '昏', '昏', ',', '使', '人',
                                         '昭', '昭']
    # Head has no match, tail is a word.
    assert list(tokenizer.cut('以其昏昏行业')) == ['以', '其', '昏', '昏', '行业']
    # Head is a word.
    assert list(tokenizer.cut('行业以其昏昏')) == ['行业', '以', '其', '昏', '昏']
    # A word sits in the middle.
    assert list(tokenizer.cut('使人昭昭行业以其昏昏')) == [
        '使', '人', '昭', '昭', '行业', '以', '其', '昏', '昏'
    ]
def __init__(self):
    """Load pypinyin's local phrase dictionary and register all phrase and
    comment keys with jieba so they segment as whole words.
    """
    # json.load() no longer accepts an `encoding` keyword on Python 3
    # (removed in 3.9); decode via open() and close handles with context
    # managers instead of leaking them.
    with open(config.data_path + '/shici_phrase.json', 'r',
              encoding='utf-8') as f:
        phrase = json.load(f)
    load_phrases_dict(phrase, style=u'default')
    for key in phrase:
        jieba.add_word(key)
    # Load comment dict
    with open(config.data_path + '/shici_comment.json', 'r',
              encoding='utf-8') as f:
        self.comment = json.load(f)
    for key in self.comment:
        jieba.add_word(key)
def __init__(self):
    """Set up text-processing helpers and register custom pypinyin readings."""
    self.textReformer = text_reformer.TextReformer()
    self.pinyin_part = pinyin_utility.load_pinyin_part_dict()
    self.wordPart = word_part.WordPart()
    # 嗯 (U+55EF) needs an explicit reading.
    pypinyin.load_single_dict(pinyin_dict={0x55ef: u"en"})
    custom_phrases = {
        u'嗯哪': [[u'en'], [u'na']],
        u'人生何处不相逢': [[u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'],
                    [u'xiang'], [u'feng']],
    }
    pypinyin.load_phrases_dict(phrases_dict=custom_phrases)
def init_text_to_vocab(self):
    """Install the text→pinyin conversion used by this pipeline."""
    # Pin readings pypinyin gets wrong in this domain: 调 as tiáo in
    # volume/brightness commands, 传 as zhuàn in titles, surname 肖 as xiāo.
    pypinyin.load_phrases_dict({
        '调大': [['tiáo'], ['dà']],
        '调小': [['tiáo'], ['xiǎo']],
        '调亮': [['tiáo'], ['liàng']],
        '调暗': [['tiáo'], ['àn']],
        '肖': [['xiāo']],
        '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
        '新传': [['xīn'], ['zhuàn']],
        '外传': [['wài'], ['zhuàn']],
        '正传': [['zhèng'], ['zhuàn']],
        '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']],
    })
    # Style 1 keeps tone marks; unknown characters are silently dropped.
    self.text_to_vocab = lambda txt: pypinyin.lazy_pinyin(txt, 1, errors='ignore')
def init_text_to_vocab(self):
    """Install the text→pinyin conversion used by this pipeline."""
    # Pin readings pypinyin gets wrong in this domain: 调 as tiáo in
    # volume/brightness commands, 传 as zhuàn in titles, surname 肖 as xiāo.
    pypinyin.load_phrases_dict({
        '调大': [['tiáo'], ['dà']],
        '调小': [['tiáo'], ['xiǎo']],
        '调亮': [['tiáo'], ['liàng']],
        '调暗': [['tiáo'], ['àn']],
        '肖': [['xiāo']],
        '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
        '新传': [['xīn'], ['zhuàn']],
        '外传': [['wài'], ['zhuàn']],
        '正传': [['zhèng'], ['zhuàn']],
        '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']],
    })

    def text_to_vocab_func(txt):
        # Keep only the first (most likely) reading of each character.
        return [candidates[0] for candidates in pypinyin.pinyin(txt)]

    self.text_to_vocab = text_to_vocab_func
def init_text_to_vocab(self):
    """Install the text→pinyin conversion used by this pipeline."""
    # Pin readings pypinyin gets wrong in this domain: 调 as tiáo in
    # volume/brightness commands, 传 as zhuàn in titles, surname 肖 as xiāo.
    pypinyin.load_phrases_dict({
        '调大': [['tiáo'], ['dà']],
        '调小': [['tiáo'], ['xiǎo']],
        '调亮': [['tiáo'], ['liàng']],
        '调暗': [['tiáo'], ['àn']],
        '肖': [['xiāo']],
        '英雄传': [['yīng'], ['xióng'], ['zhuàn']],
        '新传': [['xīn'], ['zhuàn']],
        '外传': [['wài'], ['zhuàn']],
        '正传': [['zhèng'], ['zhuàn']],
        '水浒传': [['shuǐ'], ['hǔ'], ['zhuàn']],
    })

    def text_to_vocab_func(txt):
        if self.for_multi_task:
            # Style 8 appends tone digits; neutral tones get an explicit 5.
            readings = pypinyin.pinyin(txt, 8, neutral_tone_with_five=True)
        else:
            readings = pypinyin.pinyin(txt)
        # Keep only the first (most likely) reading of each character.
        return [item[0] for item in readings]

    self.text_to_vocab = text_to_vocab_func
def _pre_pinyin_setting():
    """Fix known pypinyin mispronunciations by loading phrase overrides."""
    load_phrases_dict({'嗯': [['ēn']]})
    # BUG FIX: 变 was registered as 'bià' (final 'n' missing) — must be 'biàn'.
    load_phrases_dict({'风云变幻': [['fēng'], ['yún'], ['biàn'], ['huàn']]})
    load_phrases_dict({'不破不立': [['bù'], ['pò'], ['bù'], ['lì']]})
def comp_freq3_pinyin(datatype, s=True, e=True):
    """Build a trigram model over 'char_pinyin' tokens from a JSON corpus.

    :param datatype: basename of the corpus file
        ``../../resources/<datatype>_list.json`` (a JSON list of strings).
    :param s: also count sentence-start ('S_S') contexts.
    :param e: also count sentence-end ('E_E') contexts.
    :returns: freq3 — maps each 'char_pinyin' token to a dict from the
        two-token context string 'prev2 prev1' to its relative frequency.
    """
    # Override a few heteronym readings before annotation.
    PY.load_phrases_dict({
        '那些': [['nà'], ['xiē']],
        '哪些': [['nǎ'], ['xiē']],
        '哪个': [['nǎ'], ['gè']]
    })
    # initialize the dictionary: one bucket per known char_pinyin pair
    freq3 = {'E_E': {}} if e else {}
    for char in char_pinyin.keys():
        for pinyin in char_pinyin[char]:
            freq3[char + '_' + pinyin] = {}
    # load the language material
    with open("../../resources/" + datatype + "_list.json") as in_file:
        data_list = json.load(in_file)
        in_file.close()  # NOTE(review): redundant — the with block closes it
    # count the conditional frequency
    for data_str in data_list:
        data_len = len(data_str)
        # mark pinyin: '-' flags characters not in char_pinyin; a reading that
        # is not one of the character's known readings is replaced by a random
        # known reading of that character
        data_pinyin_list = []  # the list of 'char_pinyin' tokens of data_str
        data_pinyin = PY.lazy_pinyin(data_str)
        for i in range(0, data_len):
            if data_str[i] not in char_pinyin:
                data_pinyin[i] = '-'
            elif data_pinyin[i] not in char_pinyin[data_str[i]]:
                data_pinyin[i] = random.choice(char_pinyin[data_str[i]])
            data_pinyin_list.append(data_str[i] + '_' + data_pinyin[i])
        # count frequency
        if data_len > 1:
            # starting empty character: context 'S_S <token0>' predicts token 1
            if s and data_pinyin_list[0][-1] != '-' and data_pinyin_list[1][
                    -1] != '-':
                prev_str_pinyin = ('S_S ' + data_pinyin_list[0])
                if prev_str_pinyin not in freq3[data_pinyin_list[1]]:
                    freq3[data_pinyin_list[1]][prev_str_pinyin] = 1
                else:
                    freq3[data_pinyin_list[1]][prev_str_pinyin] += 1
            # normal Chinese-pinyin pairs: advance past any '-' token so a
            # trigram never straddles an unknown character
            i = 2
            while i < data_len:
                if data_pinyin_list[i][-1] == '-':
                    i += 3
                elif data_pinyin_list[i - 1][-1] == '-':
                    i += 2
                elif data_pinyin_list[i - 2][-1] == '-':
                    i += 1
                else:
                    prev_str_pinyin = data_pinyin_list[
                        i - 2] + ' ' + data_pinyin_list[i - 1]
                    if prev_str_pinyin not in freq3[data_pinyin_list[i]]:
                        freq3[data_pinyin_list[i]][prev_str_pinyin] = 1
                    else:
                        freq3[data_pinyin_list[i]][prev_str_pinyin] += 1
                    i += 1
            # ending empty character
            if e and i == data_len:
                # i == data_len guarantees the last 2 characters are available
                prev_str_pinyin = data_pinyin_list[
                    i - 2] + ' ' + data_pinyin_list[i - 1]
                if prev_str_pinyin not in freq3[('E_E')]:
                    freq3[('E_E')][prev_str_pinyin] = 1
                else:
                    freq3[('E_E')][prev_str_pinyin] += 1
    # count the frequency of each conditional string
    freq2 = {}
    for i in freq3:
        for j in freq3[i]:
            if j in freq2:
                freq2[j] += freq3[i][j]
            else:
                freq2[j] = freq3[i][j]
    # convert frequency to relative frequency
    for i in freq3:
        for j in freq3[i]:
            freq3[i][j] /= freq2[j]
    # save the result
    try:
        out_file = open("../../statistics/freq3_pinyin_sentence_S_E.json", 'w')
        json.dump(freq3, out_file)
        out_file.close()
    finally:
        # NOTE(review): `return` inside `finally` swallows any exception
        # raised while saving — confirm this best-effort save is intended.
        return freq3
# -*- coding: UTF-8 -*-
from pypinyin import lazy_pinyin, load_phrases_dict

# Register the surname reading for 覃.
load_phrases_dict({'覃': [['Qin']]})


def get_pinying(string: str) -> str:
    """Return *string* transliterated to pinyin, each syllable title-cased."""
    syllables = lazy_pinyin(string)
    return ''.join(syllable.title() for syllable in syllables)
def test_no_non_phrases():
    """Loading a phrase dict must not invent words the segmenter lacks."""
    segmenter = mmseg.seg
    assert list(segmenter.cut('你要重新考虑这条建议')) == \
        ['你', '要', '重新', '考', '虑', '这', '条', '建', '议']
    load_phrases_dict({'在一起': [['zài'], ['yì'], ['qǐ']]})
    # 在一片 shares a prefix with 在一起 but is not itself a phrase,
    # so it still segments character by character.
    assert list(segmenter.cut('在一片')) == ['在', '一', '片']
def _pre_pinyin_setting():
    """Fix a known pypinyin mispronunciation: 嗯 should read ēn."""
    load_phrases_dict({'嗯': [['ēn']]})
def active():
    """Activate the custom phrase dictionary for heteronym handling."""
    load_phrases_dict(PHRASES_DICTIONARY)
def pypinyin_fix():
    """Register corrected readings with pypinyin.

    Consolidates the previous per-entry calls into a single phrase load and a
    single single-character load: the resulting pypinyin state is identical,
    but its internal dictionaries are touched only twice instead of 21 times.
    """
    pypinyin.load_phrases_dict({
        "哪些": [["na"], ["xie"]],
        "哪个": [["na"], ["ge"]],
        "那些": [["na"], ["xie"]],
        "白干": [["bai"], ["gan"]],
        "寻思": [["xun"], ["si"]],
        "清寒": [["qing"], ["han"]],
        "补齐": [["bu"], ["qi"]],
        "添砖加瓦": [["tian"], ["zhuan"], ["jia"], ["wa"]],
        "敬业乐群": [["jing"], ["ye"], ["le"], ["qun"]],
        "物竞天择": [["wu"], ["jing"], ["tian"], ["ze"]],
        "心存疑虑": [["xin"], ["cun"], ["yi"], ["lv"]],
        "避免麻烦": [["bi"], ["mian"], ["ma"], ["fan"]],
        "叶落归根": [["ye"], ["luo"], ["gui"], ["gen"]],
        "地动山摇": [["di"], ["dong"], ["shan"], ["yao"]],
    })
    pypinyin.load_single_dict({
        ord("帧"): "zhen",
        ord("霰"): "xian",
        ord("珩"): "heng",
        ord("嗯"): "en",
        ord("嗲"): "dia",
        ord("豉"): "chi",
        ord("聒"): "guo",
    })
def test_custom_pinyin_dict2_tone2():
    """Phrase dicts supplied in tone2 style must be honoured by every style."""
    load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
    words = ['同行']
    assert lazy_pinyin(words, style=TONE2) == ['to4ng', 'ku1']
    # The default style converts the tone2 input back to tone marks.
    assert pinyin(words) == [['tòng'], ['kū']]
"I": "ai ", "J": "jei ", "K": "kei ", "L": "el ", "M": "em ", "N": "en ", "O": "eo ", "P": "pii ", "Q": "kiu ", "R": "aa ", "S": "es ", "T": "tii ", "U": "iu ", "V": "vii ", "W": "dabliu ", "X": "eiks ", "Y": "wai ", "Z": "zii "} # PUNCTUATION2 = r'“”()×"\'()*#' # 其它符号 # load_phrases_dict({u'360': [[u'jú'], [u'zǐ']]}) def json_load(): with open("user_dict/fault-tolerant_word.json", "r") as rf: data = json.load(rf) return data usr_phrase = json_load() load_phrases_dict(usr_phrase) def text2pinyin(syllables): temp = [] for syllable in syllables: for p in PUNCTUATION: syllable = syllable.replace(p, "") # print(syllable) # if syllable.isdigit(): try: syllable = atc.num2chinese(syllable) # print("sy:", syllable) new_sounds = lazy_pinyin(syllable, style=pypinyin.TONE2) print("pinyin:" + str(new_sounds)) for e in new_sounds:
'呒', '𬉼', '帧', # pypinyin bug '豉', # pypinyin bug '𬣙', # pypinyin bug '𬇕', # pypinyin bug '𬣞', # pypinyin bug '𬘓', # pypinyin bug '𫭟', # pypinyin bug '𫭢', # pypinyin bug '𫵷', '𬇙', '𬣡', ] # checked until 6629 load_phrases_dict(pypinyin_correction) def vocal_encode_pinyin(initial, final): if len(initial) != 0 or len(final) > 2: for each, target in flypy_encoding_table_final.items(): final = final.replace(each, target) for each, target in flypy_encoding_table_initial.items(): initial = initial.replace(each, target) return initial + final def vocal_encode(item): initials = pinyin(item, style=Style.INITIALS, strict=False, heteronym=True)
from pypinyin import lazy_pinyin, load_phrases_dict

# Default reading of 朝阳 before any customisation.
print(lazy_pinyin('朝阳'))

# Override 朝 to its cháo reading for this word, then look it up again.
personalized_dict = {'朝阳': [['cháo'], ['yáng']]}
load_phrases_dict(personalized_dict)
print(lazy_pinyin('朝阳'))
import re
from pypinyin import pinyin, load_phrases_dict
from add_new_word_dict import new_word_pinyin_dict, new_word_split_dict  # , new_word_pianpang_dict
import os
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23;
# modern code should `import joblib` directly.
from sklearn.externals import joblib
import random
import jieba
import jieba.posseg as pseg

# Seed jieba with domain words and their POS tags, then tune segmentation
# frequencies for a couple of problem splits.
jieba.load_userdict(['左 f', '口 n', '日 n', '月 n', '年 n', '一起 s', '米 n'])
jieba.suggest_freq(('八', '下'), True)  # force 八/下 apart (八下)
jieba.suggest_freq(('一个', '包'), True)

# Register project-specific pinyin overrides.
load_phrases_dict(new_word_pinyin_dict)

# NOTE(review): Windows-style '\\dict\\' path separator — not portable.
dictionary_path = os.path.split(os.path.realpath(__file__))[0] + '\\dict\\'
word_component_dict, word_radical_dict, zi_radical_dict = joblib.load(
    dictionary_path + 'chaizi_dict')
word_component_dict.update(new_word_split_dict)
# word_radical_dict.update(new_word_pianpang_dict)


def find_num(input):
    # Extract Chinese numerals (一二两俩三四五六) or ASCII digits from the text.
    # NOTE(review): parameter name shadows the builtin `input`.
    result = re.findall('[一二两俩三四五六]|[0-9]', input)
    return result


# NOTE(review): this dict literal continues past this chunk.
number_dict = {
    '一': 1,
    '二': 2,
    '两': 2,
    '俩': 2,
#coding=utf-8 import pypinyin import word_part import numpy as np import pinyin_utility pypinyin.load_single_dict(pinyin_dict={0x55ef: u"en"}) pypinyin.load_phrases_dict( phrases_dict={ u'嗯哪': [[u'en'], [u'na']], u'人生何处不相逢': [[u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'], [u'xiang'], [u'feng']] }) pinyin_part_dict = pinyin_utility.load_pinyin_part_dict() wp = word_part.WordPart() def generate_vocab_vec(): raw_vocab_list, raw_vocab, size1 = load_w2v('../../model/raw.txt') pinyin_vocab_list, pinyin_vocab, size2 = load_w2v('../../model/pinyin.txt') part_vocab_list, part_vocab, size3 = load_w2v('../../model/part.txt') fw1 = open('../../model/vocab.txt', 'w') fw2 = open('../../model/vec.txt', 'w') size = size1 + size2 + size3 fw2.write(str(len(raw_vocab_list)) + ' ' + str(size) + '\n') fw1.write('UNK\n') unk_list = [0] * size unk = np.asarray(unk_list, dtype=np.float32).tolist() fw2.write(' '.join([str(i) for i in unk]) + '\n') for word in raw_vocab: word_pinyin = pypinyin.lazy_pinyin(word, errors=lambda x: u'ng') try:
def load_luna_dict():
    # Parse the Rime "Luna Pinyin" dictionary and load it into pypinyin,
    # after stripping heteronyms from pypinyin's built-in tables.
    single_dict = {}
    phrases_dict = {}
    # Luna Pinyin is built for traditional characters: the simplified
    # characters in it are treated as "inherited characters whose glyphs were
    # borrowed by mainland simplification" and carry archaic readings, so
    # using it directly on simplified zhwiki gives terrible results (-_-#).
    # opencc is used below to work around this.
    luna_dict = {}
    luna_dict_simple = {}
    with open('./rime-luna-pinyin/luna_pinyin.dict.yaml', mode='r') as f:
        for line in f:
            match = PATTERN_RIME_DICT_ITEM.match(line)
            if match:
                item = Dict(match.groupdict())
                # item.words is kept only to trace the item's origin while debugging
                word = item['word']
                item.pop('word')
                item.words = word
                item.percent = float(item.percent) if item.percent is not None else 100
                if luna_dict.get(word) is None:
                    luna_dict[word] = [item]
                else:
                    # heteronym: keep every reading of the word
                    luna_dict[word].append(item)
                word_simple = _TO_SIMPLIFIED_CHINESE.convert(word)
                if word != word_simple:
                    item_simple = Dict(item)
                    if luna_dict_simple.get(word_simple) is None:
                        luna_dict_simple[word_simple] = [item_simple]
                    else:
                        # several traditional forms collapse to one simplified
                        # form with the same reading: accumulate frequencies
                        for exist_item in luna_dict_simple[word_simple]:
                            if exist_item.pinyin == item_simple.pinyin:
                                exist_item.percent += item_simple.percent
                                exist_item.words += item_simple.words
                                # logging.info(f'exist_item: {exist_item}')
                                break
                        else:
                            luna_dict_simple[word_simple].append(item_simple)
    # Override the traditional-character readings with the simplified ones;
    # most archaic readings of the "borrowed" characters get clobbered here.
    luna_dict.update(luna_dict_simple)
    for (word, items) in luna_dict.items():
        for item in items:
            if item.percent < 5:
                # skip low-frequency readings
                continue
            if len(word) == 1:
                codePoint = ord(word)
                if single_dict.get(codePoint) is None:
                    single_dict[codePoint] = item.pinyin
                else:
                    single_dict[codePoint] = f'{single_dict[codePoint]},{item.pinyin}'
            else:
                w = item.pinyin.split(' ')
                if phrases_dict.get(word) is None:
                    phrases_dict[word] = [[it] for it in w]
                elif len(phrases_dict[word]) == len(w):
                    # another reading of the same phrase: append per-syllable
                    for i in range(len(w)):
                        phrases_dict[word][i].append(w[i])
                else:
                    logging.warn(f'invalid pinyin: {word} -> {item}')
    # Strip heteronyms from pypinyin's built-in single-character dict.
    for (word, pinyins) in PINYIN_DICT.items():
        pinyin_list = pinyins.split(',')
        if len(pinyin_list) > 1:
            PINYIN_DICT[word] = pinyin_list[0]
    # Strip heteronyms from pypinyin's built-in phrase dict.
    for (word, phrases) in PHRASES_DICT.items():
        for p in phrases:
            while(len(p) > 1):
                p.pop()
    # Load the luna dictionaries into pypinyin.
    load_single_dict(single_dict)
    load_phrases_dict(phrases_dict)
def _pre_pinyin_setting():
    """Fix known pypinyin mispronunciations."""
    fixes = (
        {'嗯': [['ēn']]},
        {'不破不立': [['bù'], ['pò'], ['bù'], ['lì']]},
    )
    for fix in fixes:
        load_phrases_dict(fix)
import re
from pypinyin import pinyin, lazy_pinyin, load_phrases_dict

if __name__ == '__main__':
    # Demo: default pypinyin output for a few words.
    print(f'{pinyin("中心")}')
    print(f'{lazy_pinyin("樊昌学")}')
    print(f'{lazy_pinyin("音乐")}')
    print(f'{lazy_pinyin("啊奥哦额")}')
    # Custom readings (deliberately odd) to demonstrate the override effect.
    load_phrases_dict({'蹒跚': [['pen'], ['shan']], '你好': [['hi'], ['ya']]})
    print(f'{lazy_pinyin("你好蹒跚好")}')
    # Split a Chinese date string on the 年/月/日/号 separators.
    listdate = re.split(r'[年月日号]', '2019年4月18')
    if listdate[-1] == '':
        listdate.pop()  # drop the empty tail left by a trailing separator
    print('-'.join(listdate))
    # (year年)? month月 day (日|号)? — year and the day suffix are optional.
    date_pattern = re.compile(r'((\d+)年)?(\d+)月(\d+)([日号])?')

    def year_repl(matchobj):
        # Rewrite a matched Chinese date as Y-M-D; default to the current
        # year when the year group is absent.
        if matchobj.group(1):
            return f'{matchobj.group(2)}-{matchobj.group(3)}-{matchobj.group(4)}'
        else:
            from datetime import datetime
            return f'{datetime.today().year}-{matchobj.group(3)}-{matchobj.group(4)}'

    print(date_pattern.sub(year_repl, '4月18'))
    print(date_pattern.sub(year_repl, '4月18'))
    print(date_pattern.sub(year_repl, '2018年4月18号'))


# NOTE(review): this definition continues past this chunk.
def process_carid(carid):
def test_custom_pinyin_dict2_tone2():
    """Phrase dicts supplied in tone2 style must be honoured by every style."""
    phrase = '同行'
    load_phrases_dict({phrase: [['to4ng'], ['ku1']]}, style='tone2')
    assert lazy_pinyin([phrase], style=TONE2) == ['to4ng', 'ku1']
    # The default style converts the tone2 input back to tone marks.
    assert pinyin(phrase) == [['tòng'], ['kū']]
使用其他分词模块: 安装分词模块,比如 pip install snownlp ; 使用经过分词处理的 字符串列表 作参数: >> from pypinyin import lazy_pinyin, TONE2 >> from snownlp import SnowNLP >> hans = u'音乐123' >> hans_seg = SnowNLP(hans).words # 分词处理 >> hans_seg [u'\u97f3\u4e50', u'123'] >> lazy_pinyin(hans_seg, style=TONE2) [u'yi1n', u'yue4', u'123'] 自定义拼音库 如果对结果不满意,可以通过 load_single_dict() 或 load_phrases_dict() 以自定义拼音库的方式修正结果: 安装了 jieba 分词模块并且支持分词的词组 >> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2 >> hans = u'桔子' >> lazy_pinyin(hans, style=TONE2) [u'jie2', u'zi3'] >> load_phrases_dict({u'桔子': [[u'jú'], [u'zǐ']]}) >> lazy_pinyin(hans, style=TONE2) [u'ju2', u'zi3'] 未安装 jieba 分词模块 and/or 不支持分词的词组 >> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2, load_single_dict >> hans = u'还没' >> lazy_pinyin(hans, style=TONE2)