def text_to_sequence(text, speaker_name='baker', inference=False):
    """Turn ``text`` into a list of symbol ids followed by an end marker.

    For the ``"baker"`` speaker, ``text`` is treated as whitespace-separated
    symbols that are looked up in ``symbol_to_id``.  When ``inference`` is
    true the raw text is first converted to pinyin (TONE3 style), entries
    containing ``"#"`` are dropped, and the remainder is mapped to phonemes
    via ``get_phoneme_from_char_and_pinyin``.

    For any other speaker the text goes through ``inference_text_to_seq``
    (inference) or ``symbols_to_ids(clean_g2p(...))`` (training).

    :param text: raw text (inference) or space-separated symbols (training)
    :param speaker_name: selects the conversion path; ``"baker"`` by default
    :param inference: whether ``text`` still needs grapheme-to-phoneme work
    :return: the id sequence with the literal ``'eos_id'`` appended
    """
    if speaker_name != "baker":
        if inference:
            sequence = inference_text_to_seq(text)
        else:
            # In train mode the text should already be transformed to phonemes.
            sequence = symbols_to_ids(clean_g2p(text.strip().split(" ")))
        # add eos tokens
        sequence += ['eos_id']
        return sequence

    if inference:
        converter = Pinyin(MyConverter())
        pinyin = converter.pinyin(text, style=Style.TONE3, errors=alpha_handler)
        print("text_to_sequence pinyin=", pinyin)
        joined_items = ("".join(item) for item in pinyin)
        new_pinyin = [joined for joined in joined_items if "#" not in joined]
        print("text_to_sequence new_pinyin=", new_pinyin)
        phonemes = get_phoneme_from_char_and_pinyin(text, new_pinyin)
        text = " ".join(phonemes)
        print(f"phoneme seq: {text}")

    sequence = []
    current = ""  # last symbol attempted, reported if the lookup fails
    try:
        for current in text.split():
            sequence.append(symbol_to_id[current])
    except Exception:
        # Best effort: report the offending symbol and keep what was converted.
        print("text_to_sequence error", current)

    # add eos tokens
    sequence += ['eos_id']
    return sequence
    def get_audio_dict(self) -> (dict, dict, dict):
        """
        Load the raw dataset index.

        For every name in ``self._use_type`` the file ``<name>.txt`` under
        ``self._dataset_path`` is read.  Each non-blank line is expected to
        hold an id and a text separated by a TAB character.  Blank lines are
        skipped.

        :return: three dicts keyed by id:
                 the audio path (``dataset_path/<name>/<id>``),
                 the text as space-separated characters, and
                 the TONE3 pinyin string, where every syllable is followed by
                 one space and syllables missing from the pinyin dictionary
                 are replaced by ``_``
        """
        dataset_path = self._dataset_path
        participle = self._participle

        id_path_dict = {}
        id_hanzi_dict = {}
        id_pinyin_dict = {}

        # Loop-invariant helpers: build the lookup table and the converter
        # once instead of once per line.
        pinyin_dict = DataUtils.get_pinyin_dict()
        my_pinyin = Pinyin(MyConverter())

        for subset in self._use_type:
            with open(file=dataset_path.joinpath(subset + '.txt'),
                      mode='r',
                      encoding='utf-8') as txt_file:
                for line in txt_file:
                    # A blank line (e.g. at end of file) has no TAB-separated
                    # fields and would otherwise raise IndexError below.
                    if not line.strip():
                        continue

                    fields = line.split('\t')

                    # Utterance id (str)
                    utt_id = fields[0]

                    # Audio path
                    path = dataset_path.joinpath(subset, utt_id)

                    # Text tokens: segment with jieba if requested, otherwise
                    # split on the spaces already present in the text.
                    hanzi = fields[1].strip('\n')
                    if participle:
                        hanzi = list(jieba.cut(hanzi, cut_all=False))
                    else:
                        hanzi = hanzi.split(' ')

                    # Pinyin syllables; unknown ones become '_'
                    syllables = []
                    for token in hanzi:
                        for char in my_pinyin.pinyin(token,
                                                     style=Style.TONE3,
                                                     heteronym=False):
                            syllable = char[0]
                            syllables.append(
                                syllable if syllable in pinyin_dict else '_')

                    id_path_dict[utt_id] = path
                    id_hanzi_dict[utt_id] = ' '.join(''.join(hanzi))
                    # Keep the original format: each syllable followed by a space.
                    id_pinyin_dict[utt_id] = ''.join(s + ' ' for s in syllables)

        return id_path_dict, id_hanzi_dict, id_pinyin_dict
Exemple #3
0
def test_use_pre_seg_to_skip_seg():
    """Check that a list returned by ``pre_seg`` drives the ``pinyin`` output."""
    class PreSegmented(Pinyin):
        def pre_seg(self, hans, **kwargs):
            # Hand back a fixed segmentation regardless of the input.
            return ['a', 'b', 'c']

    overridden = PreSegmented()
    expected_default = [['cè'], ['shì']]
    expected_overridden = [['a'], ['b'], ['c']]

    assert Pinyin().pinyin('测试') == expected_default
    assert overridden.pinyin('测试') == expected_overridden
Exemple #4
0
def test_use_post_seg_to_change_seg_result():
    """Check that a list returned by ``post_seg`` drives the ``pinyin`` output."""
    class PostSegmented(Pinyin):
        def post_seg(self, hans, seg_data, **kwargs):
            # Ignore the computed segmentation and substitute a fixed one.
            return ['a', 'b', 'c']

    overridden = PostSegmented()
    expected_default = [['cè'], ['shì']]
    expected_overridden = [['a'], ['b'], ['c']]

    assert Pinyin().pinyin('测试') == expected_default
    assert overridden.pinyin('测试') == expected_overridden
Exemple #5
0
def test_use_seg_function_change_seg_func():
    """Check that the callable returned by ``get_seg`` drives the ``pinyin`` output."""
    def fixed_seg(han):
        # Segmentation function that ignores its input.
        return ['a', 'b', 'c']

    class CustomSeg(Pinyin):
        def get_seg(self):
            return fixed_seg

    overridden = CustomSeg()
    expected_default = [['cè'], ['shì']]
    expected_overridden = [['a'], ['b'], ['c']]

    assert Pinyin().pinyin('测试') == expected_default
    assert overridden.pinyin('测试') == expected_overridden
Exemple #6
0
 def get_pinyin_parser(self):
     """Return the bound ``pinyin`` method of a new ``Pinyin`` built with ``MyConverter``."""
     return Pinyin(MyConverter()).pinyin
Exemple #7
0
from pypinyin import lazy_pinyin, Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.contrib.uv import V2UMixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin


class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """``DefaultConverter`` combined with ``NeutralToneWith5Mixin``; adds no members."""


class HerConverter(NeutralToneWith5Mixin, V2UMixin, DefaultConverter):
    """``DefaultConverter`` combined with ``NeutralToneWith5Mixin`` and ``V2UMixin``; adds no members."""


# Module-level Pinyin instances, one per custom converter, used by
# test_neutral_tone_with_5 below.
my_pinyin = Pinyin(MyConverter())
her_pinyin = Pinyin(HerConverter())


def test_neutral_tone_with_5():
    """Compare the stock ``lazy_pinyin`` with the two custom converters on '好了'.

    With ``Style.TONE2`` the custom converters are expected to mark the
    second syllable with ``5``; with the default style all three agree.
    """
    phrase = '好了'
    custom_converters = (my_pinyin, her_pinyin)

    # TONE2: the custom converters append 5 to the second syllable.
    assert lazy_pinyin(phrase, style=Style.TONE2) == ['ha3o', 'le']
    for converter in custom_converters:
        assert converter.lazy_pinyin(phrase, style=Style.TONE2) == ['ha3o', 'le5']

    # Default style: no tone marks at all, so the results are identical.
    assert lazy_pinyin(phrase) == ['hao', 'le']
    for converter in custom_converters:
        assert converter.lazy_pinyin(phrase) == ['hao', 'le']


@mark.parametrize('input,expected_old, expected_new', [
    ['你好', ['ni3', 'ha3o'], ['ni3', 'ha3o']],
Exemple #8
0
 def __init__(self):
     """Set up the pinyin parser and the lookup tables taken from ``BAKER_DICT``."""
     converter = MyConverter()
     self.pinyin_parser = Pinyin(converter).pinyin

     pinyin_table = BAKER_DICT['pinyin_dict']
     self.pinyin_dict = pinyin_table
     # Keys view of the pinyin table.
     self.all_phoneme = pinyin_table.keys()

     self.phoneme_to_id = BAKER_DICT["symbol_to_id"]
import nltk

from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin import Style
from pypinyin.style._utils import get_finals
from pypinyin.style._utils import get_initials
from tacotron_cleaner.cleaners import custom_english_cleaners


class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """``DefaultConverter`` combined with ``NeutralToneWith5Mixin``; adds no members."""


# Module-level Pinyin instance built with MyConverter; `pinyin` is its bound
# conversion method.
my_pinyin = Pinyin(MyConverter())
pinyin = my_pinyin.pinyin

# NOTE(review): presumably the language tag used for English text - confirm
# against the code that reads it.
E_lang_tag = "en_US"

try:
    # For phoneme conversion, use https://github.com/Kyubyong/g2p.
    from g2p_en import G2p

    f_g2p = G2p()
    f_g2p("")
except ImportError:
    raise ImportError(
        "g2p_en is not installed. please run `. ./path.sh && pip install g2p_en`."
    )
except LookupError:
Exemple #10
0
#!/usr/bin/env python
# coding=utf-8
import sys
from pypinyin import pinyin, Style
from pypinyin.style._utils import get_initials, get_finals
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin

# Convert the first command-line argument to TONE3 pinyin, split every
# syllable into initial and final, rewrite some spellings, and print the
# pieces separated by spaces.
#
# NOTE(review): Pinyin() is built with the default converter even though
# NeutralToneWith5Mixin is imported; presumably the "5" handling below only
# triggers with a converter that uses that mixin - confirm the intent.
my_pinyin = Pinyin()
pinyin = my_pinyin.pinyin

x = sys.argv[1]

# Keep only the first reading of every item.
text = [item[0] for item in pinyin(x, style=Style.TONE3)]

# Spelling rewrites, applied in this order to every initial/final.
_replacements = (
    ("ü", "v"),
    ("ui", "uei"),
    ("un", "uen"),
    ("iu", "iou"),
)

clean_content = []
for syllable in text:
    initial = get_initials(syllable, strict=True)
    final = get_finals(syllable, strict=True)
    for part in (initial, final):
        if not part:
            continue
        for old, new in _replacements:
            part = part.replace(old, new)
        if "5" in part:
            # Move the tone digit 5 to the end of the piece.
            part = part.replace("5", "") + "5"
        clean_content.append(part)
print(' '.join(clean_content))