def text_to_sequence(text, speaker_name='baker', inference=False):
    """Convert ``text`` into a sequence of symbol ids followed by an EOS entry.

    For the ``'baker'`` speaker, ``text`` is split on whitespace and every
    token is looked up in ``symbol_to_id``. When ``inference`` is true the raw
    text is first converted to pinyin and then to phonemes, which replace
    ``text`` before the lookup. For any other speaker the conversion is
    delegated to ``symbols_to_ids(clean_g2p(...))`` (training) or
    ``inference_text_to_seq`` (inference).

    Args:
        text: Input string. In training mode it should already consist of
            space-separated phonemes.
        speaker_name: Selects the conversion path; ``'baker'`` uses the
            pinyin/phoneme path.
        inference: Whether raw text has to be converted to phonemes first.

    Returns:
        The id sequence with ``'eos_id'`` appended. If a symbol is missing
        from ``symbol_to_id`` the ids collected so far are kept and the
        lookup stops at that symbol.
    """
    sequence = []
    if speaker_name == "baker":
        if inference:
            my_pinyin = Pinyin(MyConverter())
            # ``alpha_handler`` (defined elsewhere) is passed as pypinyin's
            # ``errors`` callback, which is invoked for characters that have
            # no pinyin.
            pinyin = my_pinyin.pinyin(
                text,
                style=Style.TONE3,
                errors=alpha_handler,
            )
            print("text_to_sequence pinyin=", pinyin)
            # Flatten each candidate list to one string and drop entries that
            # contain '#'.
            # NOTE(review): '#' entries are presumably prosody markers - confirm.
            flattened = ("".join(candidates) for candidates in pinyin)
            new_pinyin = [item for item in flattened if "#" not in item]
            print("text_to_sequence new_pinyin=", new_pinyin)
            phonemes = get_phoneme_from_char_and_pinyin(text, new_pinyin)
            text = " ".join(phonemes)
            print(f"phoneme seq: {text}")
        symbol = ""
        try:
            for symbol in text.split():
                sequence.append(symbol_to_id[symbol])
        except KeyError:
            # Best effort: report the unknown symbol and keep the ids that
            # were already collected.
            print("text_to_sequence error", symbol)
    elif not inference:
        # in train mode text should be already transformed to phonemes
        sequence = symbols_to_ids(clean_g2p(text.strip().split(" ")))
    else:
        sequence = inference_text_to_seq(text)
    # add eos tokens
    # NOTE(review): this appends the literal string 'eos_id', not a looked-up
    # id - confirm that downstream code expects the string.
    sequence += ['eos_id']
    return sequence
def get_audio_dict(self) -> (dict, dict, dict):
    """Load the raw data listing for every configured subset.

    For each name in ``self._use_type`` the file ``<name>.txt`` under
    ``self._dataset_path`` is read. Every non-blank line is expected to be
    ``<id>\\t<text>``; the text is tokenised (with jieba when
    ``self._participle`` is true, otherwise by splitting on spaces) and
    converted to tone-numbered pinyin. Syllables that are not keys of
    ``DataUtils.get_pinyin_dict()`` are replaced by ``'_'``.

    :return: three dicts keyed by id: the audio path, the text as
        space-separated characters, and the pinyin as space-terminated
        syllables
    """
    dataset_path = self._dataset_path
    participle = self._participle

    # Loop-invariant helpers: build them once instead of once per line.
    pinyin_dict = DataUtils.get_pinyin_dict()
    my_pinyin = Pinyin(MyConverter())

    id_path_dict = {}
    id_hanzi_dict = {}
    id_pinyin_dict = {}

    for subset in self._use_type:
        listing = dataset_path.joinpath(subset + '.txt')
        with open(file=listing, mode='r', encoding='utf-8') as txt_file:
            for line in txt_file:
                if not line.strip():
                    # A blank line has no tab-separated fields to parse.
                    continue
                fields = line.split('\t')

                # Build the id (str)
                utt_id = fields[0]

                # Build the audio path
                path = dataset_path.joinpath(subset, utt_id)

                # Build the text tokens, running word segmentation if requested
                hanzi = fields[1].strip('\n')
                if participle:
                    hanzi = list(jieba.cut(hanzi, cut_all=False))
                else:
                    hanzi = hanzi.split(' ')

                # Build the pinyin string; every syllable is followed by one
                # space, so the result keeps a trailing space.
                syllables = []
                for token in hanzi:
                    converted = my_pinyin.pinyin(
                        token, style=Style.TONE3, heteronym=False)
                    for candidates in converted:
                        syllable = candidates[0]
                        syllables.append(
                            syllable if syllable in pinyin_dict else '_')
                pinyin = ''.join(syllable + ' ' for syllable in syllables)

                id_path_dict[utt_id] = path
                id_hanzi_dict[utt_id] = ' '.join(''.join(hanzi))
                id_pinyin_dict[utt_id] = pinyin

    return id_path_dict, id_hanzi_dict, id_pinyin_dict
def test_use_pre_seg_to_skip_seg():
    """A list returned from ``pre_seg`` is taken as the segmentation result."""

    class PreSegmented(Pinyin):
        # Ignores the input and always hands back the same three segments.
        def pre_seg(self, hans, **kwargs):
            segments = ['a', 'b', 'c']
            return segments

    customised = PreSegmented()
    expected_default = [['cè'], ['shì']]
    expected_custom = [['a'], ['b'], ['c']]

    # The stock class is unaffected by the subclass override.
    assert Pinyin().pinyin('测试') == expected_default
    assert customised.pinyin('测试') == expected_custom
def test_use_post_seg_to_change_seg_result():
    """A list returned from ``post_seg`` replaces the segmentation result."""

    class PostSegmented(Pinyin):
        # Discards ``seg_data`` and substitutes three fixed segments.
        def post_seg(self, hans, seg_data, **kwargs):
            replacement = ['a', 'b', 'c']
            return replacement

    customised = PostSegmented()
    expected_default = [['cè'], ['shì']]
    expected_custom = [['a'], ['b'], ['c']]

    # The stock class is unaffected by the subclass override.
    assert Pinyin().pinyin('测试') == expected_default
    assert customised.pinyin('测试') == expected_custom
def test_use_seg_function_change_seg_func():
    """Overriding ``get_seg`` swaps in a custom segmentation function."""

    def fixed_seg(han):
        # Stand-in segmenter: the input is ignored on purpose.
        return ['a', 'b', 'c']

    class CustomSeg(Pinyin):
        def get_seg(self):
            return fixed_seg

    customised = CustomSeg()
    expected_default = [['cè'], ['shì']]
    expected_custom = [['a'], ['b'], ['c']]

    # The stock class is unaffected by the subclass override.
    assert Pinyin().pinyin('测试') == expected_default
    assert customised.pinyin('测试') == expected_custom
def get_pinyin_parser(self):
    """Return the bound ``pinyin`` method of a new ``Pinyin(MyConverter())``."""
    return Pinyin(MyConverter()).pinyin
from pypinyin import lazy_pinyin, Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.contrib.uv import V2UMixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin


class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """DefaultConverter with NeutralToneWith5Mixin mixed in."""
    pass


class HerConverter(NeutralToneWith5Mixin, V2UMixin, DefaultConverter):
    """DefaultConverter with both NeutralToneWith5Mixin and V2UMixin mixed in."""
    pass


# Shared converter-backed instances used by the tests in this module.
my_pinyin = Pinyin(MyConverter())
her_pinyin = Pinyin(HerConverter())


def test_neutral_tone_with_5():
    """Check how the neutral-tone syllable of '好了' is rendered.

    With ``Style.TONE2`` the module-level ``lazy_pinyin`` yields ``'le'``
    while both mixin-based instances yield ``'le5'``. With the default style
    all three yield ``'le'``.
    """
    # Numeric tone style: only the mixin-based converters append the 5.
    assert lazy_pinyin('好了', style=Style.TONE2) == ['ha3o', 'le']
    assert my_pinyin.lazy_pinyin('好了', style=Style.TONE2) == ['ha3o', 'le5']
    assert her_pinyin.lazy_pinyin('好了', style=Style.TONE2) == ['ha3o', 'le5']
    # Default style: no converter adds a tone digit.
    assert lazy_pinyin('好了') == ['hao', 'le']
    assert my_pinyin.lazy_pinyin('好了') == ['hao', 'le']
    assert her_pinyin.lazy_pinyin('好了') == ['hao', 'le']


@mark.parametrize('input,expected_old, expected_new', [
    ['你好', ['ni3', 'ha3o'], ['ni3', 'ha3o']],
def __init__(self):
    """Set up the pinyin parser and the lookup tables taken from BAKER_DICT."""
    converter = Pinyin(MyConverter())
    pinyin_table = BAKER_DICT['pinyin_dict']

    # Bound ``pinyin`` method of the converter-backed Pinyin instance.
    self.pinyin_parser = converter.pinyin
    self.pinyin_dict = pinyin_table
    # A live keys view of the pinyin table, not a copied list.
    self.all_phoneme = pinyin_table.keys()
    self.phoneme_to_id = BAKER_DICT["symbol_to_id"]
import nltk
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin import Style
from pypinyin.style._utils import get_finals
from pypinyin.style._utils import get_initials
from tacotron_cleaner.cleaners import custom_english_cleaners


class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """DefaultConverter with NeutralToneWith5Mixin mixed in."""
    pass


# Module-level converter; ``pinyin`` is its bound conversion method.
my_pinyin = Pinyin(MyConverter())
pinyin = my_pinyin.pinyin

# NOTE(review): looks like a language tag for English text; where it is
# used is outside this excerpt.
E_lang_tag = "en_US"

try:
    # For phoneme conversion, use https://github.com/Kyubyong/g2p.
    from g2p_en import G2p

    f_g2p = G2p()
    # NOTE(review): presumably a warm-up call so that missing resources
    # raise LookupError here, at import time - confirm.
    f_g2p("")
except ImportError:
    # Re-raise with installation instructions for the missing package.
    raise ImportError(
        "g2p_en is not installed. please run `. ./path.sh && pip install g2p_en`."
    )
except LookupError:
#!/usr/bin/env python
# coding=utf-8
"""Print the initials and finals of the Chinese text given as argv[1].

The text is converted with ``Style.TONE3``; every syllable is split into its
strict initial and final, each non-empty part is normalised, and the parts
are printed separated by single spaces.
"""
import sys

from pypinyin import pinyin, Style
from pypinyin.style._utils import get_initials, get_finals
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

# Substring rewrites applied, in this order, to every initial and final.
_REWRITES = (("ü", "v"), ("ui", "uei"), ("un", "uen"), ("iu", "iou"))


def _normalize_part(part):
    """Apply the rewrites to ``part`` and move any '5' to its end."""
    for old, new in _REWRITES:
        part = part.replace(old, new)
    if "5" in part:
        part = part.replace("5", "") + "5"
    return part


if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit("usage: {} TEXT".format(sys.argv[0]))

# NOTE(review): NeutralToneWith5Mixin and DefaultConverter are imported but
# the default converter is used, so the '5' handling above only matters if a
# converter that emits '5' is plugged in - confirm the intent.
my_pinyin = Pinyin()

# Keep the first candidate reading of every syllable.
text = [
    candidates[0]
    for candidates in my_pinyin.pinyin(sys.argv[1], style=Style.TONE3)
]

clean_content = []
for syllable in text:
    parts = (
        get_initials(syllable, strict=True),
        get_finals(syllable, strict=True),
    )
    # Empty parts (e.g. a syllable with no initial) are skipped.
    clean_content.extend(_normalize_part(part) for part in parts if part)

print(' '.join(clean_content))