Example no. 1
import argparse
import codecs

from joblib import Parallel, delayed

from espnet2.text.cleaner import TextCleaner
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer


def main():
    """Run phoneme conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--g2p", type=str, required=True, help="G2P type.")
    parser.add_argument("--cleaner",
                        type=str,
                        default=None,
                        help="Cleaner type.")
    parser.add_argument("--nj",
                        type=int,
                        default=4,
                        help="Number of parallel jobs.")
    parser.add_argument("in_text", type=str, help="Input kaldi-style text.")
    parser.add_argument("out_text", type=str, help="Output kaldi-style text.")
    args = parser.parse_args()

    phoneme_tokenizer = PhonemeTokenizer(args.g2p)
    cleaner = None
    if args.cleaner is not None:
        cleaner = TextCleaner(args.cleaner)
    # Read Kaldi-style text: each line is "<utt_id> <sentence>".
    with codecs.open(args.in_text, encoding="utf8") as f:
        lines = [line.strip() for line in f.readlines()]
    text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines}
    if cleaner is not None:
        text = {k: cleaner(v) for k, v in text.items()}
    # Run G2P conversion over all sentences in parallel.
    phns_list = Parallel(n_jobs=args.nj)([
        delayed(phoneme_tokenizer.text2tokens)(sentence)
        for sentence in text.values()
    ])
    # Write the phoneme sequences back out in the same Kaldi-style format.
    with codecs.open(args.out_text, "w", encoding="utf8") as g:
        for utt_id, phns in zip(text.keys(), phns_list):
            g.write(f"{utt_id} " + " ".join(phns) + "\n")
Example no. 2
from pathlib import Path
from typing import Iterable, Optional, Union

from typeguard import check_argument_types

from espnet2.text.abs_tokenizer import AbsTokenizer
from espnet2.text.char_tokenizer import CharTokenizer
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.sentencepiece_tokenizer import SentencepiecesTokenizer
from espnet2.text.word_tokenizer import WordTokenizer


def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str], None] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str], None] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: Optional[str] = None,
    g2p_type: Optional[str] = None,
) -> AbsTokenizer:
    """A helper function to instantiate Tokenizer"""
    assert check_argument_types()
    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')

        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    elif token_type == "word":
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        else:
            return WordTokenizer(delimiter=delimiter)

    elif token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    elif token_type == "phn":
        if g2p_type is None:
            raise ValueError("g2p_type is required if token_type=phn")
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )
    else:
        raise ValueError(f"token_mode must be one of bpe, word, char or phn: "
                         f"{token_type}")
Example no. 3
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    if phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は俺も若かった"
        output = [
            "m",
            "u",
            "k",
            "a",
            "sh",
            "i",
            "w",
            "a",
            "o",
            "r",
            "e",
            "m",
            "o",
            "w",
            "a",
            "k",
            "a",
            "k",
            "a",
            "cl",
            "t",
            "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は俺も若かった"
        output = ["ム", "カ", "シ", "ワ", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"]
    else:
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output
Example no. 4
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    if phoneme_tokenizer.g2p_type is None:
        input = "HH AH0 L OW1   W ER1 L D"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en_no_space":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "u",
            "k",
            "a",
            "sh",
            "i",
            "w",
            "a",
            "pau",
            "o",
            "r",
            "e",
            "m",
            "o",
            "w",
            "a",
            "k",
            "a",
            "k",
            "a",
            "cl",
            "t",
            "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は、俺も若かった"
        output = [
            "ム", "カ", "シ", "ワ", "、", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "4",
            "-3",
            "u",
            "4",
            "-3",
            "k",
            "4",
            "-2",
            "a",
            "4",
            "-2",
            "sh",
            "4",
            "-1",
            "i",
            "4",
            "-1",
            "w",
            "4",
            "0",
            "a",
            "4",
            "0",
            "o",
            "3",
            "-2",
            "r",
            "3",
            "-1",
            "e",
            "3",
            "-1",
            "m",
            "3",
            "0",
            "o",
            "3",
            "0",
            "w",
            "2",
            "-1",
            "a",
            "2",
            "-1",
            "k",
            "2",
            "0",
            "a",
            "2",
            "0",
            "k",
            "2",
            "1",
            "a",
            "2",
            "1",
            "cl",
            "2",
            "2",
            "t",
            "2",
            "3",
            "a",
            "2",
            "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent_with_pause":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "4",
            "-3",
            "u",
            "4",
            "-3",
            "k",
            "4",
            "-2",
            "a",
            "4",
            "-2",
            "sh",
            "4",
            "-1",
            "i",
            "4",
            "-1",
            "w",
            "4",
            "0",
            "a",
            "4",
            "0",
            "pau",
            "o",
            "3",
            "-2",
            "r",
            "3",
            "-1",
            "e",
            "3",
            "-1",
            "m",
            "3",
            "0",
            "o",
            "3",
            "0",
            "w",
            "2",
            "-1",
            "a",
            "2",
            "-1",
            "k",
            "2",
            "0",
            "a",
            "2",
            "0",
            "k",
            "2",
            "1",
            "a",
            "2",
            "1",
            "cl",
            "2",
            "2",
            "t",
            "2",
            "3",
            "a",
            "2",
            "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_prosody":
        input = "昔は、俺も若かった"
        output = [
            "^",
            "m",
            "u",
            "[",
            "k",
            "a",
            "sh",
            "i",
            "w",
            "a",
            "_",
            "o",
            "[",
            "r",
            "e",
            "m",
            "o",
            "#",
            "w",
            "a",
            "[",
            "k",
            "a",
            "]",
            "k",
            "a",
            "cl",
            "t",
            "a",
            "$",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "ka3",
            "er3",
            "pu3",
            "pei2",
            "wai4",
            "sun1",
            "wan2",
            "hua2",
            "ti1",
            "。",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p_phone":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "k",
            "a3",
            "er3",
            "p",
            "u3",
            "p",
            "ei2",
            "uai4",
            "s",
            "uen1",
            "uan2",
            "h",
            "ua2",
            "t",
            "i1",
            "。",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_arabic":
        input = "السلام عليكم"
        output = [
            "ʔ", "a", "s", "s", "a", "l", "ˈaː", "m", "ʕ", "l", "ˈiː", "k", "m"
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_german":
        input = "Das hört sich gut an."
        output = [
            "d",
            "a",
            "s",
            "h",
            "ˈœ",
            "ɾ",
            "t",
            "z",
            "ɪ",
            "ç",
            "ɡ",
            "ˈuː",
            "t",
            "ˈa",
            "n",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_french":
        input = "Bonjour le monde."
        output = ["b", "ɔ̃", "ʒ", "ˈu", "ʁ", "l", "ə-", "m", "ˈɔ̃", "d", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_spanish":
        input = "Hola Mundo."
        output = ["ˈo", "l", "a", "m", "ˈu", "n", "d", "o", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_russian":
        input = "Привет мир."
        output = ["p", "rʲ", "i", "vʲ", "ˈe", "t", "mʲ", "ˈi", "r", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_greek":
        input = "Γειά σου Κόσμε."
        output = ["j", "ˈa", "s", "u", "k", "ˈo", "s", "m", "e", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_finnish":
        input = "Hei maailma."
        output = ["h", "ˈei", "m", "ˈaː", "ɪ", "l", "m", "a", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_hungarian":
        input = "Helló Világ."
        output = ["h", "ˈɛ", "l", "l", "oː", "v", "ˈi", "l", "aː", "ɡ", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_dutch":
        input = "Hallo Wereld."
        output = ["h", "ˈɑ", "l", "oː", "ʋ", "ˈɪː", "r", "ə", "l", "t", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_hindi":
        input = "नमस्ते दुनिया"
        output = [
            "n", "ə", "m", "ˈʌ", "s", "t", "eː", "d", "ˈʊ", "n", "ɪ", "j",
            "ˌaː"
        ]
    elif phoneme_tokenizer.g2p_type == "g2pk":
        input = "안녕하세요 세계입니다."
        output = [
            "ᄋ",
            "ᅡ",
            "ᆫ",
            "ᄂ",
            "ᅧ",
            "ᆼ",
            "ᄒ",
            "ᅡ",
            "ᄉ",
            "ᅦ",
            "ᄋ",
            "ᅭ",
            " ",
            "ᄉ",
            "ᅦ",
            "ᄀ",
            "ᅨ",
            "ᄋ",
            "ᅵ",
            "ᆷ",
            "ᄂ",
            "ᅵ",
            "ᄃ",
            "ᅡ",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "g2pk_no_space":
        input = "안녕하세요 세계입니다."
        output = [
            "ᄋ",
            "ᅡ",
            "ᆫ",
            "ᄂ",
            "ᅧ",
            "ᆼ",
            "ᄒ",
            "ᅡ",
            "ᄉ",
            "ᅦ",
            "ᄋ",
            "ᅭ",
            "ᄉ",
            "ᅦ",
            "ᄀ",
            "ᅨ",
            "ᄋ",
            "ᅵ",
            "ᆷ",
            "ᄂ",
            "ᅵ",
            "ᄃ",
            "ᅡ",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_english_us_vits":
        input = "Hello, World."
        output = [
            "h",
            "ə",
            "l",
            "ˈ",
            "o",
            "ʊ",
            ",",
            "<space>",
            "w",
            "ˈ",
            "ɜ",
            "ː",
            "l",
            "d",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "korean_jaso":
        input = "나는 학교에 갑니다."
        output = [
            "ᄂ",
            "ᅡ",
            "ᄂ",
            "ᅳ",
            "ᆫ",
            "<space>",
            "ᄒ",
            "ᅡ",
            "ᆨ",
            "ᄀ",
            "ᅭ",
            "ᄋ",
            "ᅦ",
            "<space>",
            "ᄀ",
            "ᅡ",
            "ᆸ",
            "ᄂ",
            "ᅵ",
            "ᄃ",
            "ᅡ",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "korean_jaso_no_space":
        input = "나는 학교에 갑니다."
        output = [
            "ᄂ",
            "ᅡ",
            "ᄂ",
            "ᅳ",
            "ᆫ",
            "ᄒ",
            "ᅡ",
            "ᆨ",
            "ᄀ",
            "ᅭ",
            "ᄋ",
            "ᅦ",
            "ᄀ",
            "ᅡ",
            "ᆸ",
            "ᄂ",
            "ᅵ",
            "ᄃ",
            "ᅡ",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "is_g2p":
        input = "hlaupa í burtu í dag"
        output = [
            "l_0",
            "9i:",
            ".",
            "p",
            "a",
            ",",
            "i:",
            ",",
            "p",
            "Y",
            "r_0",
            ".",
            "t",
            "Y",
            ",",
            "i:",
            ",",
            "t",
            "a:",
            "G",
        ]
    else:
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output
Example no. 5
@pytest.fixture(params=["g2p_en", "pyopenjtalk"])  # illustrative subset of the supported g2p types
def phoneme_tokenizer(request):
    return PhonemeTokenizer(g2p_type=request.param)
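Each test that accepts this fixture runs once per parametrized g2p type. The fixture is equivalent to constructing the tokenizer directly; a minimal sketch, assuming the pyopenjtalk backend is installed (the expected prefix is taken from the tests below):

tokenizer = PhonemeTokenizer(g2p_type="pyopenjtalk")
assert tokenizer.text2tokens("昔は俺も若かった")[:4] == ["m", "u", "k", "a"]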
Example no. 6
def test_token2text(phoneme_tokenizer: PhonemeTokenizer):
    assert phoneme_tokenizer.tokens2text(["a", "b", "c"]) == "abc"


def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    if phoneme_tokenizer.g2p_type is None:
        input = "HH AH0 L OW1 <space> W ER1 L D"
        output = ["HH", "AH0", "L", "OW1", "<space>", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en_no_space":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は俺も若かった"
        output = [
            "m",
            "u",
            "k",
            "a",
            "sh",
            "i",
            "w",
            "a",
            "o",
            "r",
            "e",
            "m",
            "o",
            "w",
            "a",
            "k",
            "a",
            "k",
            "a",
            "cl",
            "t",
            "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent":
        input = "昔は俺も若かった"
        output = [
            "m",
            "4",
            "-3",
            "u",
            "4",
            "-3",
            "k",
            "4",
            "-2",
            "a",
            "4",
            "-2",
            "sh",
            "4",
            "-1",
            "i",
            "4",
            "-1",
            "w",
            "4",
            "0",
            "a",
            "4",
            "0",
            "o",
            "3",
            "-2",
            "r",
            "3",
            "-1",
            "e",
            "3",
            "-1",
            "m",
            "3",
            "0",
            "o",
            "3",
            "0",
            "w",
            "2",
            "-1",
            "a",
            "2",
            "-1",
            "k",
            "2",
            "0",
            "a",
            "2",
            "0",
            "k",
            "2",
            "1",
            "a",
            "2",
            "1",
            "cl",
            "2",
            "2",
            "t",
            "2",
            "3",
            "a",
            "2",
            "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は俺も若かった"
        output = ["ム", "カ", "シ", "ワ", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "ka3",
            "er3",
            "pu3",
            "pei2",
            "wai4",
            "sun1",
            "wan2",
            "hua2",
            "ti1",
            "。",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p_phone":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "k",
            "a3",
            "er3",
            "p",
            "u3",
            "p",
            "ei2",
            "uai4",
            "s",
            "un1",
            "uan2",
            "h",
            "ua2",
            "t",
            "i1",
            "。",
        ]
    else:
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output
Example no. 8
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    if phoneme_tokenizer.g2p_type is None:
        input = "HH AH0 L OW1 <space> W ER1 L D"
        output = ["HH", "AH0", "L", "OW1", "<space>", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en_no_space":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "u",
            "k",
            "a",
            "sh",
            "i",
            "w",
            "a",
            "pau",
            "o",
            "r",
            "e",
            "m",
            "o",
            "w",
            "a",
            "k",
            "a",
            "k",
            "a",
            "cl",
            "t",
            "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は、俺も若かった"
        output = [
            "ム", "カ", "シ", "ワ", "、", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "4",
            "-3",
            "u",
            "4",
            "-3",
            "k",
            "4",
            "-2",
            "a",
            "4",
            "-2",
            "sh",
            "4",
            "-1",
            "i",
            "4",
            "-1",
            "w",
            "4",
            "0",
            "a",
            "4",
            "0",
            "o",
            "3",
            "-2",
            "r",
            "3",
            "-1",
            "e",
            "3",
            "-1",
            "m",
            "3",
            "0",
            "o",
            "3",
            "0",
            "w",
            "2",
            "-1",
            "a",
            "2",
            "-1",
            "k",
            "2",
            "0",
            "a",
            "2",
            "0",
            "k",
            "2",
            "1",
            "a",
            "2",
            "1",
            "cl",
            "2",
            "2",
            "t",
            "2",
            "3",
            "a",
            "2",
            "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent_with_pause":
        input = "昔は、俺も若かった"
        output = [
            "m",
            "4",
            "-3",
            "u",
            "4",
            "-3",
            "k",
            "4",
            "-2",
            "a",
            "4",
            "-2",
            "sh",
            "4",
            "-1",
            "i",
            "4",
            "-1",
            "w",
            "4",
            "0",
            "a",
            "4",
            "0",
            "pau",
            "o",
            "3",
            "-2",
            "r",
            "3",
            "-1",
            "e",
            "3",
            "-1",
            "m",
            "3",
            "0",
            "o",
            "3",
            "0",
            "w",
            "2",
            "-1",
            "a",
            "2",
            "-1",
            "k",
            "2",
            "0",
            "a",
            "2",
            "0",
            "k",
            "2",
            "1",
            "a",
            "2",
            "1",
            "cl",
            "2",
            "2",
            "t",
            "2",
            "3",
            "a",
            "2",
            "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "ka3",
            "er3",
            "pu3",
            "pei2",
            "wai4",
            "sun1",
            "wan2",
            "hua2",
            "ti1",
            "。",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p_phone":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "k",
            "a3",
            "er3",
            "p",
            "u3",
            "p",
            "ei2",
            "uai4",
            "s",
            "un1",
            "uan2",
            "h",
            "ua2",
            "t",
            "i1",
            "。",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_arabic":
        input = u"السلام عليكم"
        output = [
            "ʔ", "a", "s", "s", "ˈa", "l", "aː", "m", "ʕ", "l", "ˈiː", "k", "m"
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_german":
        input = "Das hört sich gut an."
        output = [
            "d",
            "a",
            "s",
            "h",
            "ˈœ",
            "ɾ",
            "t",
            "z",
            "ɪ",
            "ç",
            "ɡ",
            "ˈuː",
            "t",
            "ˈa",
            "n",
            ".",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_french":
        input = "Bonjour le monde."
        output = ["b", "ɔ̃", "ʒ", "ˈu", "ʁ", "l", "ə-", "m", "ˈɔ̃", "d", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_spanish":
        input = "Hola Mundo."
        output = ["ˈo", "l", "a", "m", "ˈu", "n", "d", "o", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_russian":
        input = "Привет мир."
        output = ["p", "rʲ", "i", "vʲ", "ˈe", "t", "mʲ", "ˈi", "r", "."]
    else:
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output