def get_pinyin():
    """Collect every pinyin reading in ``PINYIN_DICT`` in three converted forms.

    Each value of ``PINYIN_DICT`` is split on commas, and entries that are
    empty or whitespace-only are dropped. The surviving readings are then
    passed through ``convert`` with three numeric style codes.

    NOTE(review): the three sets are built but never returned or stored, so
    as written this function always returns ``None`` -- confirm whether a
    ``return`` statement was lost.
    """
    from pypinyin.constants import PINYIN_DICT
    from pypinyin.style import convert

    raw_readings = set()
    for joined_readings in PINYIN_DICT.values():
        for reading in joined_readings.split(","):
            if reading.strip():
                raw_readings.add(reading)

    # Style codes 8 / 3 / 9 are presumably Style.TONE3, Style.INITIALS and
    # Style.FINALS_TONE3 -- TODO confirm against pypinyin's Style enum.
    pinyin_set = {convert(reading, 8, True) for reading in raw_readings}
    pin_set = {convert(reading, 3, True) for reading in raw_readings}
    yin_set = {convert(reading, 9, True) for reading in raw_readings}
def get_pinyin_for_match(pinyin_tone_list):
    """Index pinyin syllables by their initial + final concatenation.

    :param pinyin_tone_list: iterable of pinyin syllables
    :return: dict mapping ``f"{initial}{final}"`` to the ``(initial, final)``
        tuple, where both parts come from ``convert`` in strict mode with
        ``Style.INITIALS`` and ``Style.FINALS``. When both parts are empty
        the original syllable is used as the key instead. The first syllable
        that produces a given key wins; later ones are ignored.
    """
    matches = {}
    for syllable in pinyin_tone_list:
        initial = convert(syllable, strict=True, style=Style.INITIALS)
        final = convert(syllable, strict=True, style=Style.FINALS)
        # Fall back to the raw syllable when conversion produced neither an
        # initial nor a final (possible bad input).
        key = f"{initial}{final}" if (initial or final) else syllable
        matches.setdefault(key, (initial, final))
    return matches
def to_fixed(pinyin, style, strict=True):
    """Format a tone-marked pinyin syllable according to a pinyin style.

    :param pinyin: a single pinyin syllable
    :param style: the pinyin style
    :param strict: whether to strictly follow the "Scheme for the Chinese
                   Phonetic Alphabet" when handling initials and finals
    :return: the pinyin string formatted according to the given style
    :rtype: unicode
    """
    # The input syllable is also handed to ``convert`` as ``default``;
    # presumably it is what comes back when the style cannot be applied --
    # confirm against ``convert``.
    formatted = convert(pinyin, style=style, strict=strict, default=pinyin)
    return formatted
def create_poly_dic():
    """Convert ``polyphones.txt`` into ``polyphones.json``.

    Every non-blank line of ``polyphones.txt`` is split on whitespace. The
    second field is treated as a comma-separated list of pinyin readings and
    the last field is used as the dictionary key (presumably the character
    itself -- confirm against the input file format). Each reading is run
    through ``style.convert(..., style=8, strict=False)`` and gets a trailing
    ``'5'`` when the result does not already end in ``1``-``4``.

    The resulting mapping ``key -> [reading, ...]`` is written to
    ``polyphones.json`` as UTF-8 JSON with non-ASCII characters kept as-is.

    :raises IndexError: if a non-blank line has fewer than two fields
    """
    tone_digits = ('1', '2', '3', '4')
    poly_dict = defaultdict(list)

    with codecs.open("polyphones.txt", 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        words = line.split()
        # Skip blank lines. Testing the split result (rather than comparing
        # against '\n') also covers '\r\n' and whitespace-only lines, which
        # codecs.open does not normalise.
        if not words:
            continue
        for reading in words[1].split(','):
            # A stray or trailing comma yields an empty reading; ignore it.
            if not reading:
                continue
            # Style code 8 is presumably Style.TONE3 (tone number appended to
            # the syllable) -- TODO confirm against pypinyin's Style enum.
            converted = style.convert(reading, style=8, strict=False)
            # Note: in the source file the neutral tone carries no tone mark,
            # but in our dataset the neutral tone is written as 5.
            if not converted.endswith(tone_digits):
                converted += '5'
            poly_dict[words[-1]].append(converted)

    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # must be opened with an explicit encoding instead of the locale default.
    with open('polyphones.json', "w", encoding='utf-8') as json_file:
        json.dump(poly_dict, json_file, ensure_ascii=False, indent=2)
def test_finals_tone3_no_final():
    """Check ``convert('ń', Style.FINALS_TONE3, ...)`` for both flag values.

    With the third argument ``True`` the expected result is an empty string;
    with ``False`` it is ``'n2'``. The third positional argument is presumably
    ``strict`` and the fourth ``default`` -- confirm against ``convert``.

    NOTE(review): another function with this exact name is defined later in
    this file. The later definition replaces this one at import time, so this
    version would not be collected by the test runner, and the two disagree
    on the expected result for the ``True`` case -- confirm which is intended.
    """
    syllable = 'ń'

    strict_result = convert(syllable, Style.FINALS_TONE3, True, None)
    assert strict_result == ''

    loose_result = convert(syllable, Style.FINALS_TONE3, False, None)
    assert loose_result == 'n2'
def test_finals_tone3_no_final():
    """Check that ``convert('ń', Style.FINALS_TONE3, True, None)`` is ``'n2'``.

    NOTE(review): an earlier function in this file has the same name and
    expects ``''`` for this same call. This definition shadows the earlier
    one, so only this expectation is actually exercised -- confirm which of
    the two matches the intended behaviour.
    """
    converted = convert('ń', Style.FINALS_TONE3, True, None)
    assert converted == 'n2'