def do_phonemize(args):
    """Generate phonemes for text using config"""
    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols, phoneme_to_sequence

    c = load_config(args.config)
    _, phonemes = make_symbols(**c.characters)

    if args.text:
        # Use text passed as arguments
        texts = args.text
    else:
        # Use stdin
        texts = sys.stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    for line in texts:
        line = line.strip()
        if not line:
            continue

        line_indexes = phoneme_to_sequence(
            line,
            [c.text_cleaner],
            language=c.phoneme_language,
            enable_eos_bos=False,
            tp=c.characters if "characters" in c.keys() else None,
            backend=c.phoneme_backend,
        )
        line_phonemes = [phonemes[i] for i in line_indexes]

        print(args.separator.join(line_phonemes))
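# A minimal usage sketch for do_phonemize, assuming it sits behind an argparse
# CLI. The flag names (--config, --text, --separator) are inferred from the
# attributes the function reads off `args`; the real command-line interface
# may differ.
import argparse

parser = argparse.ArgumentParser(prog="phonemize")
parser.add_argument("--config", required=True, help="path to the TTS model config")
parser.add_argument("--text", nargs="*", help="lines of text; stdin is read when omitted")
parser.add_argument("--separator", default=" ", help="string joined between output phonemes")
do_phonemize(parser.parse_args())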
def get_characters(config: Coqpit):
    # TODO: implement CharacterProcessor
    # NOTE: returns a (model_characters, config, num_chars) tuple; the original
    # `-> str` annotation was incorrect and has been dropped.
    if config.characters is not None:
        symbols, phonemes = make_symbols(**config.characters)
    else:
        from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols

        config.characters = CharactersConfig(**parse_symbols())
    model_characters = phonemes if config.use_phonemes else symbols
    num_chars = len(model_characters) + getattr(config, "add_blank", False)
    return model_characters, config, num_chars
def get_characters(config: Coqpit):
    # TODO: implement CharacterProcessor
    # NOTE: returns a (model_characters, config) tuple; the original `-> str`
    # annotation was incorrect and has been dropped.
    if config.characters is not None:
        symbols, phonemes = make_symbols(**config.characters)
    else:
        from TTS.tts.utils.text.symbols import (  # pylint: disable=import-outside-toplevel
            parse_symbols,
            phonemes,
            symbols,
        )

        config.characters = parse_symbols()
    model_characters = phonemes if config.use_phonemes else symbols
    return model_characters, config
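# A hedged usage sketch for the get_characters variant directly above (which
# returns two values; the earlier variant also returns num_chars). Given a
# loaded training config, it resolves the symbol set the model should use:
# phonemes when config.use_phonemes is set, plain characters otherwise.
# "config.json" is a placeholder path, not a file from this repository.
from TTS.utils.io import load_config

config = load_config("config.json")  # hypothetical config path
model_characters, config = get_characters(config)
print(f"model uses {len(model_characters)} symbols")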
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
    """Load the TTS model.

    Args:
        tts_checkpoint (str): path to the model checkpoint.
        tts_config_path (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

    if "characters" in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    if self.tts_config.use_speaker_embedding is True:
        self.tts_speakers_file = (
            self.tts_speakers_file
            if self.tts_speakers_file
            else self.tts_config["external_speaker_embedding_file"]
        )
        self._load_speakers(self.tts_speakers_file)

    self.tts_model = setup_model(
        self.input_size,
        num_speakers=self.num_speakers,
        c=self.tts_config,
        speaker_embedding_dim=self.speaker_embedding_dim,
    )
    self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    self.tts_model = setup_model(self.input_size, num_speakers=self.num_speakers, c=self.tts_config)
    self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    # pylint: disable=global-statement
    global symbols, phonemes

    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    if 'characters' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.characters)

    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)

    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)

    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
        print(f" > model reduction factor: {cp['r']}")
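# A minimal sketch of how the load_tts/_load_tts variants above are typically
# called from the owning synthesizer/server object. The checkpoint and config
# paths are placeholders, and `synthesizer` is assumed to be an instance of the
# owning class with any attributes (e.g. num_speakers) the newer variants rely on.
synthesizer.load_tts(
    tts_checkpoint="/path/to/checkpoint_130000.pth.tar",  # placeholder checkpoint
    tts_config="/path/to/config.json",                    # placeholder config
    use_cuda=False,
)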
def do_verify_phonemes(args):
    """Verify that phoneme cache matches what gruut would produce"""
    import numpy as np

    from TTS.utils.io import load_config
    from TTS.tts.utils.text import make_symbols

    _LOGGER.debug("Loading gruut language %s", args.language)
    gruut_lang = gruut.Language.load(
        Path("~/.config/rhasspy/profiles/de/tts/larynx/de/thorsten/gruut/de-de").expanduser(),
        args.language,
    )
    assert gruut_lang, f"Unsupported language: {args.language}"

    # Load config
    c = load_config(args.config)
    output_path = Path(c.output_path)
    phoneme_cache_dir = Path(c.phoneme_cache_path)

    _, phonemes = make_symbols(**c.characters)

    # Offset for pad
    phoneme_to_id = {p: (i + 1) for i, p in enumerate(phonemes)}

    # Add pad
    phoneme_to_id["_"] = 0

    # Include or exclude word break symbol (#)
    word_breaks = c.get("characters", {}).get("word_breaks", True)

    # Load lexicon and missing words
    lexicon = gruut_lang.phonemizer.lexicon

    missing_words_path = output_path / "missing_words.txt"
    if missing_words_path.is_file():
        _LOGGER.debug("Loading missing words from %s", missing_words_path)
        with open(missing_words_path, "r") as missing_words_file:
            gruut.utils.load_lexicon(missing_words_file, lexicon=lexicon)

    # Load metadata
    id_to_text = {}
    for ds in c.datasets:
        metadata_path = Path(ds["path"]) / ds["meta_file_train"]
        with open(metadata_path, "r") as metadata_file:
            for line in metadata_file:
                line = line.strip()
                if line:
                    item_id, item_text = line.split("|", maxsplit=1)
                    id_to_text[item_id] = item_text

    id_to_phonemes = {}

    for phoneme_path in phoneme_cache_dir.glob("*.npy"):
        item_id = re.sub("_phoneme$", "", phoneme_path.stem)
        _LOGGER.debug("Processing %s (id=%s)", phoneme_path, item_id)

        sequence = np.load(phoneme_path, allow_pickle=True)
        actual_phonemes = [phonemes[index] for index in sequence]

        expected_phonemes = id_to_phonemes.get(item_id)
        if not expected_phonemes:
            # Compute expected phonemes
            expected_phonemes = []

            item_text = id_to_text[item_id]
            for sentence in gruut_lang.tokenizer.tokenize(item_text):
                # Choose first pronunciation for each word
                word_phonemes = [
                    wp[0]
                    for wp in gruut_lang.phonemizer.phonemize(
                        sentence.clean_words,
                        word_indexes=True,
                        word_breaks=word_breaks,
                        separate_tones=None,
                    )
                    if wp
                ]

                expected_phonemes.extend(p for ps in word_phonemes for p in ps)

            # Associate with item id
            id_to_phonemes[item_id] = expected_phonemes

        assert (
            actual_phonemes == expected_phonemes
        ), f"Got {actual_phonemes}, expected {expected_phonemes} for '{item_text}'"
        print(item_id, "OK")
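# For reference, a sketch of what the verifier above expects to find in the
# phoneme cache: one <item_id>_phoneme.npy file per utterance holding the
# integer phoneme-id sequence. The file and item names below are illustrative
# placeholders only.
import numpy as np

sequence = np.load("phoneme_cache/LJ001-0001_phoneme.npy", allow_pickle=True)  # hypothetical item
print("first phoneme ids:", sequence[:10])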
def __init__(self, text, expected_output_audio_format, file_name):
    # set a pysbd segmenter to be used later to divide the input into segments
    self.seg = pysbd.Segmenter(language="en", clean=True)

    # runtime settings
    use_cuda = False

    # model paths - models and config files are taken from Mozilla TTS's github page
    TTS_MODEL = "/path/to/checkpoint_130000.pth.tar"
    TTS_CONFIG = "server/config/config.json"
    VOCODER_MODEL = "/path/to/checkpoint_1450000.pth.tar"
    VOCODER_CONFIG = "server/config/config_vocoder.json"

    # load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    self.TTS_CONFIG = TTS_CONFIG  # set it as a class variable to be later used by convert_audio_to()
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    # load the audio processor
    ap = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    self.speaker_id = None
    self.speakers = []

    # use the imported symbols and phonemes
    global symbols, phonemes

    use_phonemes = TTS_CONFIG.use_phonemes
    if 'characters' in TTS_CONFIG.keys():
        symbols, phonemes = make_symbols(**TTS_CONFIG.characters)

    if use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    # load the model
    model = setup_model(num_chars, len(self.speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model weights
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 3000

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    # ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()

    # TODO: need to train a model?
    wav = self.tts(model, text, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)
    print(len(wav.tobytes()))

    # save the generated .wav file as (file_name + "_audio.wav")
    wavfile.write(file_name + "_audio.wav", TTS_CONFIG.audio["sample_rate"], wav)

    # convert the generated audio file to the specified audio format
    self.convert_audio_to(expected_output_audio_format, file_name)
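# A minimal sketch of how the class above might be instantiated. The class name
# (TextToSpeech) is a placeholder; the snippet only illustrates the constructor
# contract used in __init__: the text to synthesize, the target audio format
# handed to convert_audio_to(), and the output file stem used for the
# intermediate "<file_name>_audio.wav" file.
tts_job = TextToSpeech(
    text="Hello world, this is a synthesis test.",
    expected_output_audio_format="mp3",  # hypothetical target format
    file_name="hello_world",             # intermediate wav: hello_world_audio.wav
)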