Example #1
0
def test_custom_pinyin_dict():
    """Check that '桔' reads 'ju2' once a custom single-character entry is loaded."""
    char = '桔'
    expected = ['ju2']
    try:
        # Probe the stock reading first; a mismatch at this point is
        # deliberately tolerated.
        assert lazy_pinyin(char, style=TONE2) == expected
    except AssertionError:
        pass
    # Register the custom entry with 'jú' listed first.
    load_single_dict({ord(char): 'jú,jié'})
    assert lazy_pinyin(char, style=TONE2) == expected
Example #2
0
def rectify():
    """Register custom reading lists for five common polyphonic characters."""
    readings_by_char = (
        ('的', 'de,di'),
        ('地', 'de,di'),
        ('了', 'le,liao'),
        ('着', 'zhe,zhuo'),
        ('还', 'hai,huan'),
    )
    # load_single_dict is keyed by Unicode code point, hence ord().
    pypinyin.load_single_dict(
        {ord(char): readings for char, readings in readings_by_char}
    )
Example #3
0
def test_custom_pinyin_dict():
    """Verify that a custom entry for '桔' yields 'ju2' in TONE2 style."""
    target = '桔'
    wanted = ['ju2']
    try:
        # The reading before the override may or may not match already;
        # a failed pre-check is ignored on purpose.
        assert lazy_pinyin(target, style=TONE2) == wanted
    except AssertionError:
        pass
    # After loading the override the expected reading must come first.
    load_single_dict({ord(target): 'jú,jié'})
    assert lazy_pinyin(target, style=TONE2) == wanted
Example #4
0
    def __init__(self):
        """Create the helper objects and register custom pinyin overrides."""
        self.textReformer = text_reformer.TextReformer()
        self.pinyin_part = pinyin_utility.load_pinyin_part_dict()
        self.wordPart = word_part.WordPart()

        # Single-character override, keyed by code point 0x55ef.
        single_overrides = {0x55ef: u"en"}
        # Phrase overrides carry one reading list per character.
        phrase_overrides = {
            u'嗯哪': [[u'en'], [u'na']],
            u'人生何处不相逢': [
                [u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'],
                [u'xiang'], [u'feng'],
            ],
        }
        pypinyin.load_single_dict(pinyin_dict=single_overrides)
        pypinyin.load_phrases_dict(phrases_dict=phrase_overrides)
Example #5
0
def hz2pinyin():  # generate the pinyin dev, test and train sets
    """Convert the dev/test/train hanzi files to pinyin files, line by line.

    For each split, every line of ``pyin_hz_data/<split>.hz`` is stripped,
    converted with ``S(..., separator='', strict=False)`` and written,
    followed by a newline, to ``pyin_hz_data/<split>.pyin``.
    """
    load_single_dict({ord('嗯'): 'en'})
    for split in ('dev', 'test', 'train'):
        target_path = 'pyin_hz_data/{}.pyin'.format(split)
        source_path = 'pyin_hz_data/{}.hz'.format(split)
        # The output is opened first, then the input; managing both in one
        # ``with`` makes sure the input handle is closed as well.
        with open(target_path, 'w') as f, open(source_path) as src:
            for line in src:
                pinyinList = S(line.strip(), separator='', strict=False)
                f.write('{}\n'.format(pinyinList))
Example #6
0
def py(s):
    """Return the first-letter pinyin abbreviation of *s*.

    The first letters produced by ``lazy_pinyin`` with
    ``Style.FIRST_LETTER`` are concatenated without a separator.

    NOTE(review): the original description mentions upper case, but no case
    conversion is performed here.
    """
    # Reorder the readings of "长" so that "cháng" comes first.
    load_single_dict({ord('长'): 'cháng,zhǎng'})
    first_letters = lazy_pinyin(s, style=Style.FIRST_LETTER)
    return ''.join(first_letters)
Example #7
0
def active():
    """
    Activate the custom single-character polyphone readings.

    Passes ``SINGLE_DICTIONARY`` to ``load_single_dict``.
    """
    load_single_dict(SINGLE_DICTIONARY)
def pypinyin_fix():
    """Register toneless readings for a set of phrases and single characters.

    The overrides are collected into two dictionaries and loaded with one
    ``load_phrases_dict`` call followed by one ``load_single_dict`` call,
    rather than one loader call per entry.
    """
    # Phrase -> space-separated syllables, one syllable per character.
    phrase_readings = {
        "哪些": "na xie",
        "哪个": "na ge",
        "那些": "na xie",
        "白干": "bai gan",
        "寻思": "xun si",
        "清寒": "qing han",
        "补齐": "bu qi",
        "添砖加瓦": "tian zhuan jia wa",
        "敬业乐群": "jing ye le qun",
        "物竞天择": "wu jing tian ze",
        "心存疑虑": "xin cun yi lv",
        "避免麻烦": "bi mian ma fan",
        "叶落归根": "ye luo gui gen",
        "地动山摇": "di dong shan yao",
    }
    # load_phrases_dict expects a list of single-item lists per phrase.
    pypinyin.load_phrases_dict({
        phrase: [[syllable] for syllable in reading.split()]
        for phrase, reading in phrase_readings.items()
    })

    single_readings = {
        "帧": "zhen",
        "霰": "xian",
        "珩": "heng",
        "嗯": "en",
        "嗲": "dia",
        "豉": "chi",
        "聒": "guo",
    }
    # load_single_dict is keyed by Unicode code point.
    pypinyin.load_single_dict({
        ord(char): reading for char, reading in single_readings.items()
    })
使用其他分词模块:

安装分词模块,比如 pip install snownlp ;

使用经过分词处理的 字符串列表 作参数:

>>> from pypinyin import lazy_pinyin, TONE2
>>> from snownlp import SnowNLP
>>> hans = u'音乐123'
>>> hans_seg = SnowNLP(hans).words  # 分词处理
>>> hans_seg
[u'\u97f3\u4e50', u'123']
>>> lazy_pinyin(hans_seg, style=TONE2)
[u'yi1n', u'yue4', u'123']
自定义拼音库
如果对结果不满意,可以通过 load_single_dict() 或 load_phrases_dict() 以自定义拼音库的方式修正结果:

已安装 jieba 分词模块,并且分词模块支持该词组:

>>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2
>>> hans = u'桔子'
>>> lazy_pinyin(hans, style=TONE2)
[u'jie2', u'zi3']
>>> load_phrases_dict({u'桔子': [[u'jú'], [u'zǐ']]})
>>> lazy_pinyin(hans, style=TONE2)
[u'ju2', u'zi3']
未安装 jieba 分词模块,或者分词模块不支持该词组:

>>> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2, load_single_dict
>>> hans = u'还没'
>>> lazy_pinyin(hans, style=TONE2)
Example #10
0
#coding=utf-8
import sys
import codecs
# Python 2 idiom: reload sys to get setdefaultencoding back, then make UTF-8
# the default codec.
reload(sys)  
sys.setdefaultencoding('utf8')
import win32com.client
# Attach to InDesign through COM automation.
app = win32com.client.Dispatch('Indesign.Application')
from pypinyin import pinyin, lazy_pinyin, load_single_dict

# Result of InDesign's FindGrep(); iterated further down as the items to annotate.
myTitles=app.FindGrep()
# Fix a wrongly assigned pinyin: override the readings of code point 0x7684
# with "de" listed first.
load_single_dict({0x7684:u'de,d\xed'})


#处理拼音格式
def addPinyin(sometext):
    """Return the pinyin of *sometext* as one space-separated string.

    The lookup is made with ``heteronym=True``, but only the first reading
    reported for each item is used.
    """
    readings = pinyin(sometext, heteronym=True)
    # Join with single spaces (no builtin-shadowing accumulator, no repeated
    # concatenation); rstrip() removes any trailing whitespace.
    return u' '.join(item[0] for item in readings).rstrip()
# Add the pinyin

for myT in myTitles:
    # Turn the ruby flag on for the first text object, then set its ruby
    # string to the pinyin of that object's contents.
    myT.Texts[0].RubyFlag=True
    myT.Texts[0].RubyString=addPinyin(myT.Texts[0].Contents)
  
def tunepypinyin():
    """Override the reading lists of three common particles, one call each."""
    overrides = (
        (u"的", u"de0,di2"),
        (u"得", u"de2,dei3,de0"),
        (u"了", u"le0,liao3"),
    )
    for char, readings in overrides:
        # Keyed by code point, as load_single_dict expects.
        load_single_dict({ord(char): readings})
Example #12
0
def test_custom_pinyin_dict_tone2():
    """Load a tone2-style entry for '桔' and check it in two output styles."""
    char = '桔'
    custom_entry = {ord(char): 'ce4,si4'}
    load_single_dict(custom_entry, style='tone2')
    # The first listed reading is expected in TONE2 output ...
    assert lazy_pinyin(char, style=TONE2) == ['ce4']
    # ... and with a tone mark in the default output of pinyin().
    assert pinyin(char) == [['cè']]
Example #13
0
def load_luna_dict():
    """Replace pypinyin's heteronym data with readings from Luna Pinyin.

    Steps:
      1. Parse ``./rime-luna-pinyin/luna_pinyin.dict.yaml`` line by line with
         ``PATTERN_RIME_DICT_ITEM``.
      2. For every entry whose simplified form differs from the original,
         record the entry under the simplified form too, summing the weights
         of entries that collapse onto the same word and pinyin.
      3. Let the simplified entries override the original ones, skip readings
         whose weight is below 5, and build the single-character and phrase
         dictionaries.
      4. Strip all but the first reading from the built-in ``PINYIN_DICT`` and
         ``PHRASES_DICT``, then load the Luna dictionaries into pypinyin.
    """
    single_dict = {}
    phrases_dict = {}
    # Luna Pinyin is a dictionary designed for traditional characters; the
    # simplified characters in it are treated as "inherited characters whose
    # shape was borrowed by mainland simplification" and are annotated with
    # "ancient" readings. Using it directly on zhwiki text that contains
    # simplified characters gives terrible results, so OpenCC is used here to
    # try to work around the problem.
    luna_dict = {}
    luna_dict_simple = {}
    # Rime dictionaries are UTF-8; do not rely on the platform default codec.
    with open('./rime-luna-pinyin/luna_pinyin.dict.yaml', mode='r',
              encoding='utf-8') as f:
        for line in f:
            match = PATTERN_RIME_DICT_ITEM.match(line)
            if not match:
                continue
            item = Dict(match.groupdict())
            # The ``words`` field is only used to trace where an item came
            # from when debugging.
            word = item['word']
            item.pop('word')
            item.words = word
            item.percent = float(item.percent) if item.percent is not None else 100

            # Several entries for one word mean it has multiple readings.
            luna_dict.setdefault(word, []).append(item)

            word_simple = _TO_SIMPLIFIED_CHINESE.convert(word)
            if word == word_simple:
                continue
            item_simple = Dict(item)
            existing_items = luna_dict_simple.get(word_simple)
            if existing_items is None:
                luna_dict_simple[word_simple] = [item_simple]
                continue
            # Several traditional words may map to one simplified word with
            # the same reading; in that case the weights are accumulated.
            for exist_item in existing_items:
                if exist_item.pinyin == item_simple.pinyin:
                    exist_item.percent += item_simple.percent
                    exist_item.words += item_simple.words
                    break
            else:
                existing_items.append(item_simple)

    # Overwrite the traditional-character annotations with the simplified
    # ones, so most readings of the "borrowed-shape inherited characters"
    # get replaced.
    luna_dict.update(luna_dict_simple)
    for word, items in luna_dict.items():
        for item in items:
            if item.percent < 5:
                # Skip low-frequency readings.
                continue
            if len(word) == 1:
                codePoint = ord(word)
                if single_dict.get(codePoint) is None:
                    single_dict[codePoint] = item.pinyin
                else:
                    single_dict[codePoint] = f'{single_dict[codePoint]},{item.pinyin}'
                continue
            syllables = item.pinyin.split(' ')
            if phrases_dict.get(word) is None:
                phrases_dict[word] = [[syllable] for syllable in syllables]
            elif len(phrases_dict[word]) == len(syllables):
                # Append the alternative reading position by position.
                for readings, syllable in zip(phrases_dict[word], syllables):
                    readings.append(syllable)
            else:
                # logging.warn is a deprecated alias; use logging.warning.
                logging.warning('invalid pinyin: %s -> %s', word, item)

    # Remove the alternative readings from the built-in single-character
    # dictionary (only existing keys are reassigned, so iterating is safe).
    for word, pinyins in PINYIN_DICT.items():
        pinyin_list = pinyins.split(',')
        if len(pinyin_list) > 1:
            PINYIN_DICT[word] = pinyin_list[0]
    # Remove the alternative readings from the built-in phrase dictionary.
    for phrases in PHRASES_DICT.values():
        for p in phrases:
            del p[1:]
    # Load the Luna dictionaries.
    load_single_dict(single_dict)
    load_phrases_dict(phrases_dict)
Example #14
0
#coding=utf-8
import pypinyin
import word_part
import numpy as np
import pinyin_utility
# Register the custom readings first, then create the helper objects.
# The single-character override is keyed by code point 0x55ef.
pypinyin.load_single_dict({0x55ef: u"en"})
# Phrase overrides carry one reading list per character.
pypinyin.load_phrases_dict({
    u'嗯哪': [[u'en'], [u'na']],
    u'人生何处不相逢': [
        [u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'],
        [u'xiang'], [u'feng'],
    ],
})
pinyin_part_dict = pinyin_utility.load_pinyin_part_dict()
wp = word_part.WordPart()


def generate_vocab_vec():
    raw_vocab_list, raw_vocab, size1 = load_w2v('../../model/raw.txt')
    pinyin_vocab_list, pinyin_vocab, size2 = load_w2v('../../model/pinyin.txt')
    part_vocab_list, part_vocab, size3 = load_w2v('../../model/part.txt')
    fw1 = open('../../model/vocab.txt', 'w')
    fw2 = open('../../model/vec.txt', 'w')
    size = size1 + size2 + size3
    fw2.write(str(len(raw_vocab_list)) + ' ' + str(size) + '\n')
    fw1.write('UNK\n')
    unk_list = [0] * size
    unk = np.asarray(unk_list, dtype=np.float32).tolist()
    fw2.write(' '.join([str(i) for i in unk]) + '\n')
    for word in raw_vocab:
        word_pinyin = pypinyin.lazy_pinyin(word, errors=lambda x: u'ng')
        try:
Example #15
0
#coding=utf-8
import sys
# Python 2 idiom: reload sys to get setdefaultencoding back, then make UTF-8
# the default codec.
reload(sys)
sys.setdefaultencoding('utf8')
import win32com.client
import codecs
# Attach to InDesign through COM automation.
app = win32com.client.Dispatch('Indesign.Application')
from pypinyin import pinyin, lazy_pinyin, load_single_dict
# Read the JSX script; the with-block closes the file handle afterwards.
with codecs.open('C:\\Users\\jjp\\py4id\\readTitle.jsx','r','utf-8') as myfile:
    ff=myfile.read()
# NOTE(review): 1246973031 is presumably InDesign's script-language
# enumerator for JavaScript -- confirm against the InDesign scripting docs.
myTitles=app.DoScript(ff,1246973031)
# load_single_dict expects {code point: 'reading,reading'}. A phrase-style
# entry keyed by the character itself is never looked up, so the override is
# given in the single-character format (0x7684 is the character's code point).
load_single_dict({0x7684: u'de,d\xed'})
#处理拼音格式
def addPinyin(sometext):
    """Print and return the pinyin of *sometext* as a space-separated string.

    The lookup is made with ``heteronym=True``, but only the first reading
    reported for each item is used.
    """
    readings = pinyin(sometext, heteronym=True)
    # Join with single spaces (no builtin-shadowing accumulator, no repeated
    # concatenation); rstrip() removes any trailing whitespace.
    result = u' '.join(item[0] for item in readings).rstrip()
    # A single-argument print(...) behaves the same on Python 2 and Python 3.
    print(result)
    return result
# Add the pinyin

for myT in myTitles:
    # Turn the ruby flag on for the first text object, then set its ruby
    # string to the pinyin of that object's contents.
    myT.Texts[0].RubyFlag=True
    myT.Texts[0].RubyString=addPinyin(myT.Texts[0].Contents)
  
Example #16
0

# Show every reading of each character, in TONE2 style.
print(pinyin('中心', style=Style.TONE2, heteronym=True))

# Same text through lazy_pinyin with its default arguments.
print(lazy_pinyin('中心'))

# Readings of a three-character string before any custom dictionary is loaded.
print(pinyin('翟偲翀', heteronym=True))

from pypinyin import load_phrases_dict, load_single_dict
# Disabled example of registering a phrase and a single-character override:
# load_phrases_dict({'步履蹒跚': [['bù'], ['lǚ'], ['pán'], ['shān']]})
# load_single_dict({ord('蹒'): 'pán'})
# print(pinyin('步履蹒跚'))

single_sur_dict = {}
phrases_sur_dict = {}
# Each line of sur_pinyin.dict is "<text>\t<pinyin>". Multi-character entries
# separate their per-character readings with "/".
with open('sur_pinyin.dict', 'r') as fr:
    for line in fr:
        words = line.strip().split('\t')
        if len(words) < 2 or not words[0]:
            # Blank or malformed line: skip it instead of failing with
            # IndexError (missing column) or TypeError (ord of '').
            continue
        if len(words[0]) > 1:
            # Phrase entry: one single-item reading list per character.
            phrases_sur_dict[words[0]] = [[spy] for spy in words[1].split('/')]
        else:
            # Single character: keyed by code point.
            single_sur_dict[ord(words[0])] = words[1]

load_phrases_dict(phrases_sur_dict)
load_single_dict(single_sur_dict)

print(pinyin('翟偲翀'))
from pypinyin import lazy_pinyin, load_phrases_dict, Style, load_single_dict
# Print the reading of a phrase before and after a phrase override.
hans = '桔子'
hans1 = lazy_pinyin(hans, style=Style.TONE2)
print(hans1)
load_phrases_dict({'桔子': [['jié'], ['zǐ']]})  # add the phrase "桔子", deliberately using a wrong pinyin
hans2 = lazy_pinyin(hans, style=Style.TONE2)
print(hans2)
# Print the reading of a phrase before and after a single-character override.
hanm = '还没'
hanm1 = lazy_pinyin(hanm, style=Style.TONE2)
print(hanm1)
load_single_dict({ord('还'): 'hái,huán'})  # adjust the order of the readings of "还"
hanm2 = lazy_pinyin('还没', style=Style.TONE2)
print(hanm2)
Example #18
0
                next_2_word = seg[index + 2][0]
            if (next_pos == 'a' and next_2_word not in not_in_after_2
                    and next_2_pos != 'uv') or (index > 0 and seg[index - 1][1]
                                                in before_de):
                py = 'de'
            elif (index == 0 or seg[index - 1][0] != '不') and (next_word + next_2_word) not in not_in_2_next and next_word not in not_in_2_next\
            and ((next_pos in after_dei) or (next_2_pos in after_2_dei)):
                py = 'děi'
        return py


# Module start-up: import pypinyin, load the custom dictionaries and build
# the helpers, printing progress messages along the way.
print('Loading Pinyin Converter...')
from pypinyin import pinyin, lazy_pinyin, load_phrases_dict, Style, load_single_dict
print('Loading custom dictionary...')
# phrases_dict and pinyin_dict are defined earlier in the file (not shown here).
load_phrases_dict(phrases_dict)
load_single_dict(pinyin_dict)
print('Finalizing...')
# Prefix a string with an apostrophe when it starts with one of the listed
# plain or tone-marked a/o/e vowels; other strings are returned unchanged.
add_apostrophe = lambda x: re.sub(r'^([aoeāáǎàōóǒòēéěè])', r"'\1", x)
# Wrap the helper with np.vectorize so it can be called on arrays.
add_apostrophe = np.vectorize(add_apostrophe)
import opencc
# OpenCC converters built from the 's2tw.json' and 't2s.json' configurations.
converter = opencc.OpenCC('s2tw.json')
converter_t2s = opencc.OpenCC('t2s.json')
print('Hanzi2Phonetics is ready.')

# if segmentation_package == 'hanlp':
#     def to_pinyin(x):
#         x, beginning_list = breakdown(x.translate(punctuations_cn2py))
#         ans = []
#         seg, tag = segment(x)
#         for (seg_part, tag_part, is_beginning_of_sentence) in zip(s, t, beginning_list):
#             l = len(seg_part)
Example #19
0
# coding:gbk

import json
import os
import re
# from xpinyin import Pinyin
from pypinyin import lazy_pinyin, load_single_dict

wd = dict()  # word_list
py = dict()

# Register all toneless single-character overrides with one loader call
# instead of one call per character; entries are keyed by code point.
load_single_dict({
    ord('帧'): 'zhen',
    ord('嗯'): 'en',
    ord('嗲'): 'dia',
    ord('芎'): 'xiong',
    ord('菹'): 'zu',
    ord('呒'): 'fu',
    ord('丬'): 'pan',
    ord('嬷'): 'mo',
    ord('珩'): 'heng',
    ord('砉'): 'hua',
    ord('碡'): 'zhou',
    ord('聒'): 'guo',
    ord('蚵'): 'ke',
    ord('豉'): 'chi',
    ord('霰'): 'xian',
})


def load_pyhz_list(pinyin2hanzi_path):
    pinyin2hanzi = open(pinyin2hanzi_path)
    hanzi = open("D://project/Aiwork/lib/一二级汉字表.txt").read()
Example #20
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import sys
import collections

# NOTE(review): this is set before pypinyin is imported, presumably because
# the library reads it at import time -- confirm against the pypinyin docs.
os.environ['PYPINYIN_NO_DICT_COPY'] = '1'

import pypinyin
import terra_pinyin

# Load the single-character and phrase readings provided by terra_pinyin.
pypinyin.load_single_dict(terra_pinyin.pinyin_dict)
pypinyin.load_phrases_dict(terra_pinyin.phrases_dict)

# Captures runs of one or more characters from the listed CJK code-point ranges.
RE_UCJK = re.compile('([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
                     '\U00020000-\U0002A6DF\U0002A700-\U0002B73F'
                     '\U0002B740-\U0002B81F\U0002B820-\U0002CEAF'
                     '\U0002F800-\U0002FA1F]+)')

with open(sys.argv[1], 'w', encoding='utf-8') as w1, \
    open(sys.argv[2], 'w', encoding='utf-8') as w2, \
    open(sys.argv[3], 'w', encoding='utf-8') as w3:
    for ln in sys.stdin.buffer:
        for seg in RE_UCJK.findall(ln.decode('utf-8', 'ignore')):
            pinyins = pypinyin.pinyin(seg,
                                      style=pypinyin.Style.NORMAL,
                                      errors='ignore')
            length = len(seg)
            if length != len(pinyins):
Example #21
0
def test_custom_pinyin_dict_tone2():
    """Register a tone2-style entry for '桔' and check two output styles."""
    target = '桔'
    tone2_entry = {ord(target): 'ce4,si4'}
    load_single_dict(tone2_entry, style='tone2')
    # First listed reading, rendered with a tone number.
    assert lazy_pinyin(target, style=TONE2) == ['ce4']
    # Same reading, rendered with a tone mark by pinyin() defaults.
    assert pinyin(target) == [['cè']]
Example #22
0
from pypinyin import pinyin, Style, load_single_dict
import json
import os

SAVE_DIR = os.path.dirname(__file__)
# pinyin --> number dictionary.
pinyin2num_dict_fp = os.path.join(SAVE_DIR, 'pinyin2num_dict.json')
# Raw pinyin/phoneme table; per the original note, the file above is
# generated from this table when it does not exist.
PinYinTable_fp = os.path.join(SAVE_DIR, 'PinYinTable_modern.csv')

# Register the custom readings that go with the selected table file.
load_single_dict(
    {ord('嗯'): 'en2', ord('哟'): 'you4'}
    if os.path.basename(PinYinTable_fp) == 'PinYinTable_classic.csv'
    else {ord('嗯'): 'ng2'}
)


def prepare_pinyinbase():
    tables = []
    with open(PinYinTable_fp, 'r', encoding='utf8') as f:
        lines = f.read().split('\n')
        for line in lines:
            tables.append(line.split(','))

    pinyin2num_dict = {}
    k = 0
    for i in range(1, len(tables)):