def main():
    """Run phoneme conversion.

    Reads a kaldi-style text file (``<utt_id> <sentence>`` per line),
    optionally applies a text cleaner, converts every sentence to phoneme
    tokens in parallel, and writes the result as a kaldi-style text file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--g2p", type=str, required=True, help="G2P type.")
    parser.add_argument("--cleaner", type=str, default=None, help="Cleaner type.")
    parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs.")
    parser.add_argument("in_text", type=str, help="Input kaldi-style text.")
    parser.add_argument("out_text", type=str, help="Output kaldi-style text.")
    args = parser.parse_args()

    phoneme_tokenizer = PhonemeTokenizer(args.g2p)
    cleaner = None
    if args.cleaner is not None:
        cleaner = TextCleaner(args.cleaner)

    with codecs.open(args.in_text, encoding="utf8") as f:
        # Iterate the file directly (no readlines()) and skip blank lines:
        # a blank line would make "line.split()[0]" below raise IndexError.
        lines = [line.strip() for line in f if line.strip()]
    # First whitespace-separated field is the utterance id, rest is the text.
    text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines}
    if cleaner is not None:
        text = {k: cleaner(v) for k, v in text.items()}
    # Tokenize all sentences in parallel; result order matches text.values().
    phns_list = Parallel(n_jobs=args.nj)(
        [
            delayed(phoneme_tokenizer.text2tokens)(sentence)
            for sentence in text.values()
        ]
    )
    with codecs.open(args.out_text, "w", encoding="utf8") as g:
        for utt_id, phns in zip(text.keys(), phns_list):
            g.write(f"{utt_id} " + " ".join(phns) + "\n")
def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: str = None,
    g2p_type: str = None,
) -> AbsTokenizer:
    """A helper function to instantiate Tokenizer.

    Args:
        token_type: One of "bpe", "word", "char", or "phn".
        bpemodel: SentencePiece model path (required for "bpe").
        non_linguistic_symbols: Symbols treated as non-linguistic.
        remove_non_linguistic_symbols: Whether to strip those symbols.
        space_symbol: Symbol representing whitespace ("char"/"phn" only).
        delimiter: Word delimiter ("word" only).
        g2p_type: Grapheme-to-phoneme backend (required for "phn").

    Returns:
        An AbsTokenizer instance matching ``token_type``.

    Raises:
        ValueError: On missing required arguments or unknown ``token_type``.
        RuntimeError: For unsupported option combinations.
    """
    assert check_argument_types()
    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')
        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    elif token_type == "word":
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        else:
            return WordTokenizer(delimiter=delimiter)

    elif token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    elif token_type == "phn":
        if g2p_type is None:
            raise ValueError("g2p_type is required if token_type=phn")
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    else:
        # Fixed message: the parameter is "token_type"; the old text referred
        # to a nonexistent "token_mode" argument.
        raise ValueError(
            f"token_type must be one of bpe, word, char or phn: {token_type}"
        )
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    """Verify text2tokens output against known fixtures for each g2p type."""
    fixtures = {
        "g2p_en": (
            "Hello World",
            ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"],
        ),
        "pyopenjtalk": (
            "昔は俺も若かった",
            [
                "m", "u", "k", "a", "sh", "i", "w", "a",
                "o", "r", "e", "m", "o",
                "w", "a", "k", "a", "k", "a", "cl", "t", "a",
            ],
        ),
        "pyopenjtalk_kana": (
            "昔は俺も若かった",
            ["ム", "カ", "シ", "ワ", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"],
        ),
    }
    if phoneme_tokenizer.g2p_type not in fixtures:
        raise NotImplementedError
    text, expected = fixtures[phoneme_tokenizer.g2p_type]
    assert phoneme_tokenizer.text2tokens(text) == expected
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    """Check text2tokens against a known input/output pair per g2p backend.

    The parametrized fixture supplies one tokenizer per supported g2p_type;
    each branch below pins the expected token sequence for a fixed sentence.
    """
    # --- passthrough (no g2p): text is already space-separated phonemes ---
    if phoneme_tokenizer.g2p_type is None:
        input = "HH AH0 L OW1 W ER1 L D"
        # NOTE(review): expected output contains a " " token although the
        # input has no corresponding separator token — verify intended.
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    # --- English g2p, with and without inter-word space token ---
    elif phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en_no_space":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"]
    # --- Japanese (pyopenjtalk family); input contains a comma pause ---
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は、俺も若かった"
        output = [
            "m", "u", "k", "a", "sh", "i", "w", "a",
            "pau",  # the comma becomes an explicit pause token
            "o", "r", "e", "m", "o",
            "w", "a", "k", "a", "k", "a", "cl", "t", "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は、俺も若かった"
        output = [
            "ム", "カ", "シ", "ワ", "、", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"
        ]
    # --- accent variants: each phone is followed by two accent-value tokens ---
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent":
        input = "昔は、俺も若かった"
        output = [
            "m", "4", "-3", "u", "4", "-3",
            "k", "4", "-2", "a", "4", "-2",
            "sh", "4", "-1", "i", "4", "-1",
            "w", "4", "0", "a", "4", "0",
            "o", "3", "-2",
            "r", "3", "-1", "e", "3", "-1",
            "m", "3", "0", "o", "3", "0",
            "w", "2", "-1", "a", "2", "-1",
            "k", "2", "0", "a", "2", "0",
            "k", "2", "1", "a", "2", "1",
            "cl", "2", "2",
            "t", "2", "3", "a", "2", "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent_with_pause":
        input = "昔は、俺も若かった"
        output = [
            "m", "4", "-3", "u", "4", "-3",
            "k", "4", "-2", "a", "4", "-2",
            "sh", "4", "-1", "i", "4", "-1",
            "w", "4", "0", "a", "4", "0",
            "pau",  # pause token kept in the accent stream
            "o", "3", "-2",
            "r", "3", "-1", "e", "3", "-1",
            "m", "3", "0", "o", "3", "0",
            "w", "2", "-1", "a", "2", "-1",
            "k", "2", "0", "a", "2", "0",
            "k", "2", "1", "a", "2", "1",
            "cl", "2", "2",
            "t", "2", "3", "a", "2", "3",
        ]
    # --- prosody markers: ^ $ [ ] # _ delimit phrase/accent structure ---
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_prosody":
        input = "昔は、俺も若かった"
        output = [
            "^", "m", "u", "[", "k", "a", "sh", "i", "w", "a",
            "_", "o", "[", "r", "e", "m", "o", "#",
            "w", "a", "[", "k", "a", "]", "k", "a", "cl", "t", "a", "$",
        ]
    # --- Mandarin (pypinyin): syllable-level and phone-level variants ---
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "ka3", "er3", "pu3", "pei2", "wai4",
            "sun1", "wan2", "hua2", "ti1", "。",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p_phone":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "k", "a3", "er3", "p", "u3", "p", "ei2", "uai4",
            "s", "uen1", "uan2", "h", "ua2", "t", "i1", "。",
        ]
    # --- espeak-ng backends for various languages (IPA-style tokens) ---
    elif phoneme_tokenizer.g2p_type == "espeak_ng_arabic":
        input = "السلام عليكم"
        output = [
            "ʔ", "a", "s", "s", "a", "l", "ˈaː", "m", "ʕ", "l", "ˈiː", "k", "m"
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_german":
        input = "Das hört sich gut an."
        output = [
            "d", "a", "s", "h", "ˈœ", "ɾ", "t", "z", "ɪ", "ç",
            "ɡ", "ˈuː", "t", "ˈa", "n", ".",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_french":
        input = "Bonjour le monde."
        output = ["b", "ɔ̃", "ʒ", "ˈu", "ʁ", "l", "ə-", "m", "ˈɔ̃", "d", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_spanish":
        input = "Hola Mundo."
        output = ["ˈo", "l", "a", "m", "ˈu", "n", "d", "o", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_russian":
        input = "Привет мир."
        output = ["p", "rʲ", "i", "vʲ", "ˈe", "t", "mʲ", "ˈi", "r", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_greek":
        input = "Γειά σου Κόσμε."
        output = ["j", "ˈa", "s", "u", "k", "ˈo", "s", "m", "e", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_finnish":
        input = "Hei maailma."
        output = ["h", "ˈei", "m", "ˈaː", "ɪ", "l", "m", "a", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_hungarian":
        input = "Helló Világ."
        output = ["h", "ˈɛ", "l", "l", "oː", "v", "ˈi", "l", "aː", "ɡ", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_dutch":
        input = "Hallo Wereld."
        output = ["h", "ˈɑ", "l", "oː", "ʋ", "ˈɪː", "r", "ə", "l", "t", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_hindi":
        input = "नमस्ते दुनिया"
        output = [
            "n", "ə", "m", "ˈʌ", "s", "t", "eː", "d", "ˈʊ", "n", "ɪ", "j", "ˌaː"
        ]
    # --- Korean (g2pk): decomposed jamo, with and without the space token ---
    elif phoneme_tokenizer.g2p_type == "g2pk":
        input = "안녕하세요 세계입니다."
        output = [
            "ᄋ", "ᅡ", "ᆫ", "ᄂ", "ᅧ", "ᆼ", "ᄒ", "ᅡ", "ᄉ", "ᅦ", "ᄋ", "ᅭ",
            " ",
            "ᄉ", "ᅦ", "ᄀ", "ᅨ", "ᄋ", "ᅵ", "ᆷ", "ᄂ", "ᅵ", "ᄃ", "ᅡ", ".",
        ]
    elif phoneme_tokenizer.g2p_type == "g2pk_no_space":
        input = "안녕하세요 세계입니다."
        output = [
            "ᄋ", "ᅡ", "ᆫ", "ᄂ", "ᅧ", "ᆼ", "ᄒ", "ᅡ", "ᄉ", "ᅦ", "ᄋ", "ᅭ",
            "ᄉ", "ᅦ", "ᄀ", "ᅨ", "ᄋ", "ᅵ", "ᆷ", "ᄂ", "ᅵ", "ᄃ", "ᅡ", ".",
        ]
    # --- VITS-style English: stress marks are separate tokens, "<space>" kept ---
    elif phoneme_tokenizer.g2p_type == "espeak_ng_english_us_vits":
        input = "Hello, World."
        output = [
            "h", "ə", "l", "ˈ", "o", "ʊ", ",", "<space>",
            "w", "ˈ", "ɜ", "ː", "l", "d", ".",
        ]
    # --- Korean jaso decomposition (no pronunciation rules applied here) ---
    elif phoneme_tokenizer.g2p_type == "korean_jaso":
        input = "나는 학교에 갑니다."
        output = [
            "ᄂ", "ᅡ", "ᄂ", "ᅳ", "ᆫ", "<space>",
            "ᄒ", "ᅡ", "ᆨ", "ᄀ", "ᅭ", "ᄋ", "ᅦ", "<space>",
            "ᄀ", "ᅡ", "ᆸ", "ᄂ", "ᅵ", "ᄃ", "ᅡ", ".",
        ]
    elif phoneme_tokenizer.g2p_type == "korean_jaso_no_space":
        input = "나는 학교에 갑니다."
        output = [
            "ᄂ", "ᅡ", "ᄂ", "ᅳ", "ᆫ",
            "ᄒ", "ᅡ", "ᆨ", "ᄀ", "ᅭ", "ᄋ", "ᅦ",
            "ᄀ", "ᅡ", "ᆸ", "ᄂ", "ᅵ", "ᄃ", "ᅡ", ".",
        ]
    # --- Icelandic ---
    elif phoneme_tokenizer.g2p_type == "is_g2p":
        input = "hlaupa í burtu í dag"
        output = [
            "l_0", "9i:", ".", "p", "a", ",", "i:", ",",
            "p", "Y", "r_0", ".", "t", "Y", ",", "i:", ",",
            "t", "a:", "G",
        ]
    else:
        # Fixture was parametrized with a g2p_type this test has no data for.
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output
def phoneme_tokenizer(request):
    """Build a PhonemeTokenizer for the parametrized g2p type."""
    g2p_type = request.param
    return PhonemeTokenizer(g2p_type=g2p_type)
def test_token2text(phoneme_tokenizer: PhonemeTokenizer):
    """tokens2text should concatenate the token list into one string."""
    joined = phoneme_tokenizer.tokens2text(["a", "b", "c"])
    assert joined == "abc"
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    """Verify text2tokens output against a fixed sentence per g2p backend."""
    fixtures = {
        None: (
            "HH AH0 L OW1 <space> W ER1 L D",
            ["HH", "AH0", "L", "OW1", "<space>", "W", "ER1", "L", "D"],
        ),
        "g2p_en": (
            "Hello World",
            ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"],
        ),
        "g2p_en_no_space": (
            "Hello World",
            ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"],
        ),
        "pyopenjtalk": (
            "昔は俺も若かった",
            [
                "m", "u", "k", "a", "sh", "i", "w", "a",
                "o", "r", "e", "m", "o",
                "w", "a", "k", "a", "k", "a", "cl", "t", "a",
            ],
        ),
        "pyopenjtalk_accent": (
            "昔は俺も若かった",
            [
                "m", "4", "-3", "u", "4", "-3",
                "k", "4", "-2", "a", "4", "-2",
                "sh", "4", "-1", "i", "4", "-1",
                "w", "4", "0", "a", "4", "0",
                "o", "3", "-2",
                "r", "3", "-1", "e", "3", "-1",
                "m", "3", "0", "o", "3", "0",
                "w", "2", "-1", "a", "2", "-1",
                "k", "2", "0", "a", "2", "0",
                "k", "2", "1", "a", "2", "1",
                "cl", "2", "2",
                "t", "2", "3", "a", "2", "3",
            ],
        ),
        "pyopenjtalk_kana": (
            "昔は俺も若かった",
            ["ム", "カ", "シ", "ワ", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"],
        ),
        "pypinyin_g2p": (
            "卡尔普陪外孙玩滑梯。",
            [
                "ka3", "er3", "pu3", "pei2", "wai4",
                "sun1", "wan2", "hua2", "ti1", "。",
            ],
        ),
        "pypinyin_g2p_phone": (
            "卡尔普陪外孙玩滑梯。",
            [
                "k", "a3", "er3", "p", "u3", "p", "ei2", "uai4",
                "s", "un1", "uan2", "h", "ua2", "t", "i1", "。",
            ],
        ),
    }
    if phoneme_tokenizer.g2p_type not in fixtures:
        raise NotImplementedError
    text, expected = fixtures[phoneme_tokenizer.g2p_type]
    assert phoneme_tokenizer.text2tokens(text) == expected
def test_text2tokens(phoneme_tokenizer: PhonemeTokenizer):
    """Check text2tokens against a known input/output pair per g2p backend.

    The parametrized fixture supplies one tokenizer per supported g2p_type;
    each branch below pins the expected token sequence for a fixed sentence.
    """
    # --- passthrough (no g2p): text is already space-separated tokens ---
    if phoneme_tokenizer.g2p_type is None:
        input = "HH AH0 L OW1 <space> W ER1 L D"
        output = ["HH", "AH0", "L", "OW1", "<space>", "W", "ER1", "L", "D"]
    # --- English g2p, with and without inter-word space token ---
    elif phoneme_tokenizer.g2p_type == "g2p_en":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", " ", "W", "ER1", "L", "D"]
    elif phoneme_tokenizer.g2p_type == "g2p_en_no_space":
        input = "Hello World"
        output = ["HH", "AH0", "L", "OW1", "W", "ER1", "L", "D"]
    # --- Japanese (pyopenjtalk family); input contains a comma pause ---
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk":
        input = "昔は、俺も若かった"
        output = [
            "m", "u", "k", "a", "sh", "i", "w", "a",
            "pau",  # the comma becomes an explicit pause token
            "o", "r", "e", "m", "o",
            "w", "a", "k", "a", "k", "a", "cl", "t", "a",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_kana":
        input = "昔は、俺も若かった"
        output = [
            "ム", "カ", "シ", "ワ", "、", "オ", "レ", "モ", "ワ", "カ", "カ", "ッ", "タ"
        ]
    # --- accent variants: each phone is followed by two accent-value tokens ---
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent":
        input = "昔は、俺も若かった"
        output = [
            "m", "4", "-3", "u", "4", "-3",
            "k", "4", "-2", "a", "4", "-2",
            "sh", "4", "-1", "i", "4", "-1",
            "w", "4", "0", "a", "4", "0",
            "o", "3", "-2",
            "r", "3", "-1", "e", "3", "-1",
            "m", "3", "0", "o", "3", "0",
            "w", "2", "-1", "a", "2", "-1",
            "k", "2", "0", "a", "2", "0",
            "k", "2", "1", "a", "2", "1",
            "cl", "2", "2",
            "t", "2", "3", "a", "2", "3",
        ]
    elif phoneme_tokenizer.g2p_type == "pyopenjtalk_accent_with_pause":
        input = "昔は、俺も若かった"
        output = [
            "m", "4", "-3", "u", "4", "-3",
            "k", "4", "-2", "a", "4", "-2",
            "sh", "4", "-1", "i", "4", "-1",
            "w", "4", "0", "a", "4", "0",
            "pau",  # pause token kept in the accent stream
            "o", "3", "-2",
            "r", "3", "-1", "e", "3", "-1",
            "m", "3", "0", "o", "3", "0",
            "w", "2", "-1", "a", "2", "-1",
            "k", "2", "0", "a", "2", "0",
            "k", "2", "1", "a", "2", "1",
            "cl", "2", "2",
            "t", "2", "3", "a", "2", "3",
        ]
    # --- Mandarin (pypinyin): syllable-level and phone-level variants ---
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "ka3", "er3", "pu3", "pei2", "wai4",
            "sun1", "wan2", "hua2", "ti1", "。",
        ]
    elif phoneme_tokenizer.g2p_type == "pypinyin_g2p_phone":
        input = "卡尔普陪外孙玩滑梯。"
        output = [
            "k", "a3", "er3", "p", "u3", "p", "ei2", "uai4",
            "s", "un1", "uan2", "h", "ua2", "t", "i1", "。",
        ]
    # --- espeak-ng backends for various languages (IPA-style tokens) ---
    elif phoneme_tokenizer.g2p_type == "espeak_ng_arabic":
        input = u"السلام عليكم"
        output = [
            "ʔ", "a", "s", "s", "ˈa", "l", "aː", "m", "ʕ", "l", "ˈiː", "k", "m"
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_german":
        input = "Das hört sich gut an."
        output = [
            "d", "a", "s", "h", "ˈœ", "ɾ", "t", "z", "ɪ", "ç",
            "ɡ", "ˈuː", "t", "ˈa", "n", ".",
        ]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_french":
        input = "Bonjour le monde."
        output = ["b", "ɔ̃", "ʒ", "ˈu", "ʁ", "l", "ə-", "m", "ˈɔ̃", "d", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_spanish":
        input = "Hola Mundo."
        output = ["ˈo", "l", "a", "m", "ˈu", "n", "d", "o", "."]
    elif phoneme_tokenizer.g2p_type == "espeak_ng_russian":
        input = "Привет мир."
        output = ["p", "rʲ", "i", "vʲ", "ˈe", "t", "mʲ", "ˈi", "r", "."]
    else:
        # Fixture was parametrized with a g2p_type this test has no data for.
        raise NotImplementedError
    assert phoneme_tokenizer.text2tokens(input) == output