def tts(self, text, speaker_idx=None):
    """Synthesize speech for ``text`` one sentence at a time.

    The text is split with ``self.split_into_sentences`` and every sentence
    is passed to ``synthesis``. When a vocoder model is loaded, the
    spectrogram returned by ``synthesis`` is re-normalized for the vocoder
    and converted to audio by ``self.vocoder_model``; otherwise the waveform
    returned by ``synthesis`` is used directly (the Griffin-Lim flag is set
    whenever ``self.vocoder_model`` is None).

    Args:
        text: Input text to synthesize.
        speaker_idx: Speaker selector forwarded to ``self.init_speaker`` and
            to ``synthesis``. Defaults to None.

    Returns:
        list: The concatenated waveform samples. 10000 zero samples are
        appended after every sentence. The list is empty when the text
        yields no sentences.
    """
    start_time = time.time()
    wavs = []
    sens = self.split_into_sentences(text)
    print(" > Text splitted to sentences.")
    print(sens)

    speaker_embedding = self.init_speaker(speaker_idx)
    # Without a vocoder the TTS model output is turned into audio by synthesis().
    use_gl = self.vocoder_model is None

    for sen in sens:
        # synthesize voice
        # NOTE(review): the positional `None, False` arguments are presumably
        # `style_wav` and `truncated` - confirm against `synthesis`.
        waveform, _, _, mel_postnet_spec, _, _ = synthesis(
            self.tts_model,
            sen,
            self.tts_config,
            self.use_cuda,
            self.ap,
            speaker_idx,
            None,
            False,
            self.tts_config.enable_eos_bos_chars,
            use_gl,
            speaker_embedding=speaker_embedding,
        )
        if not use_gl:
            # denormalize tts output based on tts audio config
            mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
            device_type = "cuda" if self.use_cuda else "cpu"
            # renormalize spectrogram based on vocoder config
            vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
            # compute scale factor for possible sample rate mismatch
            scale_factor = [
                1,
                self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate,
            ]
            if scale_factor[1] != 1:
                print(" > interpolating tts model output.")
                vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
            else:
                vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
            # run vocoder model
            # [1, T, C]
            waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
        # Only the vocoder path yields a tensor that has to be moved to the
        # CPU and converted to numpy.
        if self.use_cuda and not use_gl:
            waveform = waveform.cpu()
        if not use_gl:
            waveform = waveform.numpy()
        waveform = waveform.squeeze()

        # trim silence
        waveform = trim_silence(waveform, self.ap)

        wavs += list(waveform)
        # Zero samples separating this sentence from the next one.
        wavs += [0] * 10000

    # compute stats
    process_time = time.time() - start_time
    audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
    print(f" > Processing time: {process_time}")
    # No audio is produced when the text has no sentences; skip the ratio
    # instead of dividing by zero.
    if audio_time > 0:
        print(f" > Real-time factor: {process_time / audio_time}")
    return wavs
def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None) -> List[int]:
    """🐸 TTS magic. Run all the models and generate speech.

    Args:
        text (str): input text.
        speaker_idx (str, optional): speaker id for multi-speaker models. An
            ``int`` is used directly as the speaker id; a ``str`` is looked up
            in the speaker manager (or, failing that, parsed as an integer).
            Defaults to "".
        speaker_wav (optional): clip passed to
            ``speaker_manager.compute_d_vector_from_clip`` to compute a new
            speaker embedding. Defaults to None.
        style_wav ([type], optional): style waveform for GST. Defaults to None.

    Raises:
        ValueError: if a speakers file is configured but neither
            ``speaker_idx`` nor ``speaker_wav`` is given, or if ``speaker_idx``
            is given while no speakers file is configured.

    Returns:
        List[int]: the concatenated waveform samples; 10000 zero samples are
        appended after every sentence. Empty when the text yields no
        sentences.
    """
    start_time = time.time()
    wavs = []
    sens = self.split_into_sentences(text)
    print(" > Text splitted to sentences.")
    print(sens)

    # handle multi-speaker
    speaker_embedding = None
    speaker_id = None
    if isinstance(speaker_idx, int):
        speaker_id = speaker_idx
    elif self.tts_speakers_file:
        if speaker_idx and isinstance(speaker_idx, str):
            if self.tts_config.use_d_vector_file:
                # get the speaker embedding from the saved d_vectors.
                speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
            else:
                # get speaker idx from the speaker name
                try:
                    speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx]
                except KeyError:
                    # Interpret as int
                    speaker_id = int(speaker_idx)
        elif not speaker_idx and not speaker_wav:
            # The condition checks `speaker_wav`, so that is the alternative
            # the message has to name.
            raise ValueError(
                " [!] Look like you use a multi-speaker model. "
                "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
            )
        else:
            speaker_embedding = None
    else:
        if speaker_idx:
            raise ValueError(
                f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}."
                "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
            )

    # compute a new d_vector from the given clip.
    if speaker_wav is not None:
        speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav)

    # Without a vocoder the TTS model output is turned into audio by synthesis().
    use_gl = self.vocoder_model is None

    for sen in sens:
        # synthesize voice
        outputs = synthesis(
            model=self.tts_model,
            text=sen,
            CONFIG=self.tts_config,
            use_cuda=self.use_cuda,
            ap=self.ap,
            speaker_id=speaker_id,
            style_wav=style_wav,
            enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
            use_griffin_lim=use_gl,
            d_vector=speaker_embedding,
        )
        waveform = outputs["wav"]
        mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
        if not use_gl:
            # denormalize tts output based on tts audio config
            mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
            device_type = "cuda" if self.use_cuda else "cpu"
            # renormalize spectrogram based on vocoder config
            vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
            # compute scale factor for possible sample rate mismatch
            scale_factor = [
                1,
                self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate,
            ]
            if scale_factor[1] != 1:
                print(" > interpolating tts model output.")
                vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
            else:
                vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
            # run vocoder model
            # [1, T, C]
            waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
        # Only the vocoder path yields a tensor that has to be moved to the
        # CPU and converted to numpy.
        if self.use_cuda and not use_gl:
            waveform = waveform.cpu()
        if not use_gl:
            waveform = waveform.numpy()
        waveform = waveform.squeeze()

        # trim silence
        waveform = trim_silence(waveform, self.ap)

        wavs += list(waveform)
        # Zero samples separating this sentence from the next one.
        wavs += [0] * 10000

    # compute stats
    process_time = time.time() - start_time
    audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
    print(f" > Processing time: {process_time}")
    # No audio is produced when the text has no sentences; skip the ratio
    # instead of dividing by zero.
    if audio_time > 0:
        print(f" > Real-time factor: {process_time / audio_time}")
    return wavs
def tts(
    self,
    text: str = "",
    speaker_name: str = "",
    language_name: str = "",
    speaker_wav: Union[str, List[str]] = None,
    style_wav=None,
    reference_wav=None,
    reference_speaker_name=None,
) -> List[int]:
    """🐸 TTS magic. Run all the models and generate speech.

    With ``reference_wav`` unset, ``text`` is split into sentences and each
    one is synthesized. With ``reference_wav`` set, ``transfer_voice`` is run
    on the reference clip instead (voice conversion).

    Args:
        text (str): input text.
        speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
        language_name (str, optional): language id for multi-language models. Defaults to "".
        speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
        style_wav ([type], optional): style waveform for GST. Defaults to None.
        reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
        reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.

    Raises:
        ValueError: if neither ``text`` nor ``reference_wav`` is given, or if
            the speaker / language arguments do not match the loaded model
            configuration (see the messages below).

    Returns:
        List[int]: for text synthesis, the concatenated waveform samples with
        10000 zero samples appended after every sentence. For voice
        conversion, the squeezed waveform produced for the reference clip.
    """

    def _vocode(mel_postnet_spec):
        """Run the vocoder on a TTS-normalized spectrogram; return its raw output."""
        # denormalize tts output based on tts audio config
        mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
        device_type = "cuda" if self.use_cuda else "cpu"
        # renormalize spectrogram based on vocoder config
        vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
        # compute scale factor for possible sample rate mismatch
        scale_factor = [
            1,
            self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
        ]
        if scale_factor[1] != 1:
            print(" > interpolating tts model output.")
            vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
        else:
            vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
        # run vocoder model
        # [1, T, C]
        return self.vocoder_model.inference(vocoder_input.to(device_type))

    start_time = time.time()
    wavs = []

    if not text and not reference_wav:
        raise ValueError(
            "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
        )

    if text:
        sens = self.split_into_sentences(text)
        print(" > Text splitted to sentences.")
        print(sens)

    # handle multi-speaker
    speaker_embedding = None
    speaker_id = None
    if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
        if speaker_name and isinstance(speaker_name, str):
            if self.tts_config.use_d_vector_file:
                # get the average speaker embedding from the saved d_vectors.
                speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
                    speaker_name, num_samples=None, randomize=False
                )
                speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
            else:
                # get speaker idx from the speaker name
                speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
        elif not speaker_name and not speaker_wav:
            raise ValueError(
                " [!] Look like you use a multi-speaker model. "
                "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
            )
        else:
            speaker_embedding = None
    else:
        if speaker_name:
            raise ValueError(
                f" [!] Missing speakers.json file path for selecting speaker {speaker_name}."
                "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
            )

    # handle multi-lingual
    language_id = None
    if self.tts_languages_file or (
        hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
    ):
        if language_name and isinstance(language_name, str):
            language_id = self.tts_model.language_manager.ids[language_name]
        elif not language_name:
            raise ValueError(
                " [!] Look like you use a multi-lingual model. "
                "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model."
            )
        else:
            # Reached when `language_name` is truthy but not a string.
            raise ValueError(
                f" [!] Missing language_ids.json file path for selecting language {language_name}."
                "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. "
            )

    # compute a new d_vector from the given clip.
    if speaker_wav is not None:
        speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)

    # Without a vocoder the TTS model output is used as the audio directly.
    use_gl = self.vocoder_model is None

    if not reference_wav:
        for sen in sens:
            # synthesize voice
            outputs = synthesis(
                model=self.tts_model,
                text=sen,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                speaker_id=speaker_id,
                language_id=language_id,
                style_wav=style_wav,
                use_griffin_lim=use_gl,
                d_vector=speaker_embedding,
            )
            waveform = outputs["wav"]
            mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
            if not use_gl:
                waveform = _vocode(mel_postnet_spec)
            # Only the vocoder path yields a tensor that has to be moved to
            # the CPU and converted to numpy.
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            if self.tts_config.audio["do_trim_silence"] is True:
                waveform = trim_silence(waveform, self.tts_model.ap)

            wavs += list(waveform)
            # Zero samples separating this sentence from the next one.
            wavs += [0] * 10000
    else:
        # get the speaker embedding or speaker id for the reference wav file
        reference_speaker_embedding = None
        reference_speaker_id = None
        # Check the same `ids` attribute that is read below and that the
        # multi-speaker handling above checks.
        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
            if reference_speaker_name and isinstance(reference_speaker_name, str):
                if self.tts_config.use_d_vector_file:
                    # get the speaker embedding from the saved d_vectors.
                    reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
                        reference_speaker_name
                    )[0]
                    reference_speaker_embedding = np.array(reference_speaker_embedding)[
                        None, :
                    ]  # [1 x embedding_dim]
                else:
                    # get speaker idx from the speaker name
                    reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
            else:
                reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
                    reference_wav
                )

        outputs = transfer_voice(
            model=self.tts_model,
            CONFIG=self.tts_config,
            use_cuda=self.use_cuda,
            reference_wav=reference_wav,
            speaker_id=speaker_id,
            d_vector=speaker_embedding,
            use_griffin_lim=use_gl,
            reference_speaker_id=reference_speaker_id,
            reference_d_vector=reference_speaker_embedding,
        )
        waveform = outputs
        if not use_gl:
            mel_postnet_spec = outputs[0].detach().cpu().numpy()
            waveform = _vocode(mel_postnet_spec)
        # NOTE(review): unlike the sentence loop, `.cpu()` is applied here even
        # when no vocoder is used; this assumes `transfer_voice` returns a
        # tensor in that case - confirm against `transfer_voice`.
        if self.use_cuda:
            waveform = waveform.cpu()
        if not use_gl:
            waveform = waveform.numpy()
        wavs = waveform.squeeze()

    # compute stats
    process_time = time.time() - start_time
    audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
    print(f" > Processing time: {process_time}")
    # No audio is produced when the text has no sentences; skip the ratio
    # instead of dividing by zero.
    if audio_time > 0:
        print(f" > Real-time factor: {process_time / audio_time}")
    return wavs