def _init_speaker_encoder(self, speaker_manager):
    """Initialize the SpeakerEncoder"""
    if self.encoder_checkpoint:
        if speaker_manager is None:
            speaker_manager = SpeakerManager(
                encoder_model_path=self.encoder_checkpoint,
                encoder_config_path=self.encoder_config)
        else:
            speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config)
    return speaker_manager
def test_run_all_models():
    """Check if all the models are downloadable and tts models run correctly."""
    print(" > Run synthesizer with all the models.")
    download_dir = get_user_data_dir("tts")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path())
    model_names = manager.list_models()
    for model_name in model_names:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
        if "tts_models" in model_name:
            local_download_dir = os.path.dirname(model_path)
            # download and run the model
            speaker_files = glob.glob(local_download_dir + "/speaker*")
            language_files = glob.glob(local_download_dir + "/language*")
            language_id = ""
            if len(speaker_files) > 0:
                # multi-speaker model
                if "speaker_ids" in speaker_files[0]:
                    speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0])
                elif "speakers" in speaker_files[0]:
                    speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0])
                # multi-lingual model - assuming multi-lingual models are also multi-speaker
                if len(language_files) > 0 and "language_ids" in language_files[0]:
                    language_manager = LanguageManager(language_ids_file_path=language_files[0])
                    language_id = language_manager.language_names[0]
                speaker_id = list(speaker_manager.ids.keys())[0]
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '
                )
            else:
                # single-speaker model
                run_cli(
                    f"tts --model_name {model_name} "
                    f'--text "This is an example." --out_path "{output_path}"')
            # remove downloaded models
            shutil.rmtree(download_dir)
        else:
            # only download the model
            manager.download_model(model_name)
        print(f" | > OK: {model_name}")

    folders = glob.glob(os.path.join(manager.output_prefix, "*"))
    assert len(folders) == len(model_names)
    shutil.rmtree(manager.output_prefix)
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets,
        eval_split=args.eval,
        eval_split_max_size=c.eval_split_max_size,
        eval_split_size=c.eval_split_size)

    # use eval and training partitions
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
def _load_speakers(self, speaker_file: str) -> None:
    """Load the SpeakerManager to organize multi-speaker TTS. It loads the
    speakers meta-data and the speaker encoder if it is defined.

    Args:
        speaker_file (str): path to the speakers meta-data file.
    """
    print("Loading speakers ...")
    self.speaker_manager = SpeakerManager(
        encoder_model_path=self.encoder_checkpoint,
        encoder_config_path=self.encoder_config)
    self.speaker_manager.load_d_vectors_file(
        self.tts_config.get("d_vector_file", speaker_file))
    self.num_speakers = self.speaker_manager.num_speakers
    self.d_vector_dim = self.speaker_manager.d_vector_dim
def init_from_config(config: Coqpit):
    """Initialize model from config."""
    from TTS.utils.audio import AudioProcessor

    ap = AudioProcessor.init_from_config(config)
    tokenizer = TTSTokenizer.init_from_config(config)
    speaker_manager = SpeakerManager.init_from_config(config)
    return BaseTacotron(config, ap, tokenizer, speaker_manager)
def test_secl_forward(self):
    num_speakers = 10
    num_langs = 3
    batch_size = 2

    speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG)
    speaker_encoder_config.model_params["use_torch_spec"] = True
    speaker_encoder = setup_encoder_model(speaker_encoder_config).to(device)
    speaker_manager = SpeakerManager()
    speaker_manager.encoder = speaker_encoder

    args = VitsArgs(
        language_ids_file=LANG_FILE,
        use_language_embedding=True,
        spec_segment_size=10,
        use_speaker_encoder_as_loss=True,
    )
    config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args)
    config.audio.sample_rate = 16000

    input_dummy, input_lengths, _, spec, spec_lengths, waveform = self._create_inputs(config, batch_size=batch_size)
    speaker_ids = torch.randint(0, num_speakers, (batch_size,)).long().to(device)
    lang_ids = torch.randint(0, num_langs, (batch_size,)).long().to(device)

    model = Vits(config, speaker_manager=speaker_manager).to(device)
    output_dict = model.forward(
        input_dummy,
        input_lengths,
        spec,
        spec_lengths,
        waveform,
        aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids},
    )
    self._check_forward_outputs(config, output_dict, speaker_encoder_config)
def test_speaker_embedding():
    # load config
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder
    model = setup_speaker_encoder_model(config)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    # load audio processor and speaker encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

    # load a sample audio and compute embedding
    waveform = ap.load_wav(sample_wav_path)
    mel = ap.melspectrogram(waveform)
    d_vector = manager.compute_d_vector(mel)
    assert d_vector.shape[1] == 256

    # compute d_vector directly from an input file
    d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector = torch.FloatTensor(d_vector)
    d_vector2 = torch.FloatTensor(d_vector2)
    assert d_vector.shape[0] == 256
    assert (d_vector - d_vector2).sum() == 0.0

    # compute d_vector from a list of wav files.
    d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
    d_vector3 = torch.FloatTensor(d_vector3)
    assert d_vector3.shape[0] == 256
    assert (d_vector - d_vector3).sum() != 0.0

    # remove dummy model
    os.remove(encoder_model_path)
def test_init_multispeaker(self):
    config = GlowTTSConfig(num_chars=32)
    model = GlowTTS(config)

    # speaker embedding with default speaker_embedding_dim
    config.use_speaker_embedding = True
    config.num_speakers = 5
    config.d_vector_dim = None
    model.init_multispeaker(config)
    self.assertEqual(model.c_in_channels, model.hidden_channels_enc)

    # use external speaker embeddings with speaker_embedding_dim = 301
    config = GlowTTSConfig(num_chars=32)
    config.use_d_vector_file = True
    config.d_vector_dim = 301
    model = GlowTTS(config)
    model.init_multispeaker(config)
    self.assertEqual(model.c_in_channels, 301)

    # use speaker embeddings by the provided speaker_manager
    config = GlowTTSConfig(num_chars=32)
    config.use_speaker_embedding = True
    config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech", "speakers.json")
    speaker_manager = SpeakerManager.init_from_config(config)
    model = GlowTTS(config)
    model.speaker_manager = speaker_manager
    model.init_multispeaker(config)
    self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
    self.assertEqual(model.num_speakers, speaker_manager.num_speakers)

    # use external speaker embeddings by the provided speaker_manager
    config = GlowTTSConfig(num_chars=32)
    config.use_d_vector_file = True
    config.d_vector_dim = 256
    config.d_vector_file = os.path.join(get_tests_data_path(), "dummy_speakers.json")
    speaker_manager = SpeakerManager.init_from_config(config)
    model = GlowTTS(config)
    model.speaker_manager = speaker_manager
    model.init_multispeaker(config)
    self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim)
    self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None):
    """Initialize model from config.

    Args:
        config (ForwardTTSConfig): Model config.
        samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
            Defaults to None.
    """
    from TTS.utils.audio import AudioProcessor

    ap = AudioProcessor.init_from_config(config)
    tokenizer, new_config = TTSTokenizer.init_from_config(config)
    speaker_manager = SpeakerManager.init_from_config(config, samples)
    return ForwardTTS(new_config, ap, tokenizer, speaker_manager)
def _init_speaker_manager(self):
    """Initialize the SpeakerManager"""
    # setup if multi-speaker settings are in the global model config
    speaker_manager = None
    speakers_file = get_from_config_or_model_args_with_default(self.tts_config, "speakers_file", None)
    if self._is_use_speaker_embedding():
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file)
        elif speakers_file:
            speaker_manager = SpeakerManager(speaker_id_file_path=speakers_file)

    if self._is_use_d_vector_file():
        d_vector_file = get_from_config_or_model_args_with_default(self.tts_config, "d_vector_file", None)
        if self.tts_speakers_file:
            speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file)
        elif d_vector_file:
            speaker_manager = SpeakerManager(d_vectors_file_path=d_vector_file)
    return speaker_manager
def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
    """Initialize model from config.

    Args:
        config (GlowTTSConfig): Model config.
        samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
            Defaults to None.
        verbose (bool): If True, print init messages. Defaults to True.
    """
    from TTS.utils.audio import AudioProcessor

    ap = AudioProcessor.init_from_config(config, verbose)
    tokenizer, new_config = TTSTokenizer.init_from_config(config)
    speaker_manager = SpeakerManager.init_from_config(config, samples)
    return GlowTTS(new_config, ap, tokenizer, speaker_manager)
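# Hypothetical usage sketch (not from the original sources): building a
# multi-speaker GlowTTS through the init_from_config factory above, mirroring
# the test configs elsewhere in this section. The speakers.json path is a
# placeholder and must point at a real speaker-id file.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.models.glow_tts import GlowTTS

config = GlowTTSConfig(num_chars=32)
config.use_speaker_embedding = True
config.speakers_file = "/path/to/speakers.json"  # placeholder path
model = GlowTTS.init_from_config(config)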
def test_speakers_file_processing(self):
    manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
    self.assertEqual(manager.num_speakers, 1)
    self.assertEqual(manager.embedding_dim, 256)
    manager = SpeakerManager(d_vectors_file_path=d_vectors_file_pth_path)
    self.assertEqual(manager.num_speakers, 1)
    self.assertEqual(manager.embedding_dim, 256)
    d_vector = manager.get_embedding_by_clip(manager.clip_ids[0])
    assert len(d_vector) == 256
    d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0])
    assert len(d_vectors[0]) == 256
    d_vector1 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=True)
    assert len(d_vector1) == 256
    d_vector2 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False)
    assert len(d_vector2) == 256
    assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0
def test_speakers_file_processing():
    manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
    print(manager.num_speakers)
    print(manager.d_vector_dim)
    print(manager.clip_ids)
    d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0])
    assert len(d_vector) == 256
    d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0])
    assert len(d_vectors[0]) == 256
    d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True)
    assert len(d_vector1) == 256
    d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False)
    assert len(d_vector2) == 256
    assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0
    # (inside the per-file loop of the embedding-extraction script)
    if isinstance(wav_file, list):
        speaker_name = wav_file[2]
        wav_file = wav_file[1]
    else:
        speaker_name = None

    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if args.use_cuda:
        mel_spec = mel_spec.cuda()
    embedd = model.compute_embedding(mel_spec)
    embedd = embedd.detach().cpu().numpy()

    # create speaker_mapping if target dataset is defined
    wav_file_name = os.path.basename(wav_file)
    speaker_mapping[wav_file_name] = {}
    speaker_mapping[wav_file_name]["name"] = speaker_name
    speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()

if speaker_mapping:
    # save speaker_mapping if target dataset is defined
    if ".json" not in args.output_path:
        mapping_file_path = os.path.join(args.output_path, "speakers.json")
    else:
        mapping_file_path = args.output_path
    os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

    speaker_manager = SpeakerManager()
    # pylint: disable=W0212
    speaker_manager._save_json(mapping_file_path, speaker_mapping)
    print("Speaker embeddings saved at:", mapping_file_path)
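# A minimal illustrative sketch (not from the original script) of the structure
# `speaker_mapping` takes before `_save_json` writes it to speakers.json, one
# entry per wav file as built in the loop above. The clip name, speaker name,
# and embedding values below are placeholders.
example_speaker_mapping = {
    "LJ001-0001.wav": {
        "name": "ljspeech",                 # speaker name, or None if unknown
        "embedding": [0.01, -0.23, 0.45],   # flattened d_vector (real ones are e.g. 256-dim)
    }
}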
class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesizes speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and speech is
        synthesized for each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`,
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`,
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")
        self.use_cuda = use_cuda

        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."

        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
        """get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            [type]: [description]
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_speakers(self, speaker_file: str) -> None:
        """Load the SpeakerManager to organize multi-speaker TTS. It loads the
        speakers meta-data and the speaker encoder if it is defined.

        Args:
            speaker_file (str): path to the speakers meta-data file.
""" print("Loading speakers ...") self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config, ) self.speaker_manager.load_d_vectors_file( self.tts_config.get("d_vector_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers self.d_vector_dim = self.speaker_manager.d_vector_dim def _set_tts_speaker_file(self): """Set the TTS speaker file used by a multi-speaker model.""" # setup if multi-speaker settings are in the global model config if (hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True): if self.tts_config.use_d_vector_file: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]) self.tts_config["d_vector_file"] = self.tts_speakers_file else: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else self.tts_config["speakers_file"]) # setup if multi-speaker settings are in the model args config if (self.tts_speakers_file is None and hasattr(self.tts_config, "model_args") and hasattr( self.tts_config.model_args, "use_speaker_embedding") and self.tts_config.model_args.use_speaker_embedding): _args = self.tts_config.model_args if _args.use_d_vector_file: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else _args["d_vector_file"]) _args["d_vector_file"] = self.tts_speakers_file else: self.tts_speakers_file = (self.tts_speakers_file if self.tts_speakers_file else _args["speakers_file"]) def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. Args: tts_checkpoint (str): path to the model checkpoint. tts_config_path (str): path to the model config file. use_cuda (bool): enable/disable CUDA use. """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) # Patch stats_path stats_path = self.tts_config["audio"].get("stats_path", "") if stats_path and (not os.path.isfile(stats_path)): stats_path = os.path.join(os.path.dirname(tts_checkpoint), os.path.split(stats_path)[1]) self.tts_config["audio"]["stats_path"] = stats_path # Patch speakers file speakers_file = self.tts_config.get("model_args", {}).get("speakers_file", "") if speakers_file and (not os.path.isfile(speakers_file)): speakers_file = os.path.join(os.path.dirname(tts_checkpoint), os.path.split(speakers_file)[1]) self.tts_config["model_args"]["speakers_file"] = speakers_file self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() self._set_tts_speaker_file() def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. Args: model_file (str): path to the model checkpoint. model_config (str): path to the model config file. use_cuda (bool): enable/disable CUDA use. 
""" self.vocoder_config = load_config(model_config) # Patch stats_path stats_path = self.vocoder_config["audio"].get("stats_path", "") if stats_path and (not os.path.isfile(stats_path)): stats_path = os.path.join(os.path.dirname(model_file), os.path.split(stats_path)[1]) self.vocoder_config["audio"]["stats_path"] = stats_path self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() def split_into_sentences(self, text) -> List[str]: """Split give text into sentences. Args: text (str): input text in string format. Returns: List[str]: list of sentences. """ return self.seg.segment(text) def save_wav(self, wav: List[int], path: str) -> None: """Save the waveform as a file. Args: wav (List[int]): waveform as a list of values. path (str): output path to save the waveform. """ wav = np.array(wav) self.ap.save_wav(wav, path, self.output_sample_rate) def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] sens = self.split_into_sentences(text) print(" > Text splitted to sentences.") print(sens) # handle multi-speaker speaker_embedding = None speaker_id = None if isinstance(speaker_idx, int): speaker_id = speaker_idx elif self.tts_speakers_file: if speaker_idx and isinstance(speaker_idx, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( speaker_idx)[0] else: # get speaker idx from the speaker name try: speaker_id = self.tts_model.speaker_manager.speaker_ids[ speaker_idx] except KeyError: # Interpet as int speaker_id = int(speaker_idx) elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " "You need to define either a `speaker_idx` or a `style_wav` to use a multi-speaker model." ) else: speaker_embedding = None else: if speaker_idx: raise ValueError( f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}." "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) # compute a new d_vector from the given clip. 
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav)

        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            outputs = synthesis(
                model=self.tts_model,
                text=sen,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                ap=self.ap,
                speaker_id=speaker_id,
                style_wav=style_wav,
                enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
                use_griffin_lim=use_gl,
                d_vector=speaker_embedding,
            )
            waveform = outputs["wav"]
            mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] / self.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
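# Hypothetical usage sketch (not from the original sources): driving the
# Synthesizer class above for a single-speaker model with Griffin-Lim (no
# vocoder). The checkpoint and config paths are placeholders.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="/path/to/tts_model.pth",     # placeholder path
    tts_config_path="/path/to/tts_config.json",  # placeholder path
    use_cuda=False,
)
wav = synthesizer.tts("This is an example.")
synthesizer.save_wav(wav, "output.wav")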
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True):
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file"))
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
args = parser.parse_args()

c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)

if meta_data_eval is None:
    wav_files = meta_data_train
else:
    wav_files = meta_data_train + meta_data_eval

encoder_manager = SpeakerManager(
    encoder_model_path=args.model_path,
    encoder_config_path=args.config_path,
    d_vectors_file_path=args.old_file,
    use_cuda=args.use_cuda,
)

class_name_key = encoder_manager.encoder_config.class_name_key

# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
    if isinstance(wav_file, dict):
        class_name = wav_file[class_name_key]
        wav_file = wav_file["audio_file"]
    else:
        class_name = None

    wav_file_name = os.path.basename(wav_file)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
help="Path to model config file.", ) parser.add_argument( "config_dataset_path", type=str, help="Path to dataset config file.", ) parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--eval", type=bool, help="compute eval.", default=True) args = parser.parse_args() c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) items = meta_data_train + meta_data_eval enc_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda) compute_encoder_accuracy(items, enc_manager)
    decoder_diff_spec_alpha=0.0,
    attention_norm="softmax",
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name for the model and the data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)

# init model
model = Tacotron2(config, speaker_manager)

# init the trainer and 🚀
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) parser.add_argument("--eval", type=bool, help="compute eval.", default=True) args = parser.parse_args() c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) wav_files = meta_data_train + meta_data_eval speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda) # compute speaker embeddings speaker_mapping = {} for idx, wav_file in enumerate(tqdm(wav_files)): if isinstance(wav_file, list): speaker_name = wav_file[2] wav_file = wav_file[1] else: speaker_name = None # extract the embedding embedd = speaker_manager.compute_d_vector_from_clip(wav_file) # create speaker_mapping if target dataset is defined