def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
    """Initialize the vocoder from a checkpoint and its config.

    Steps: read the vocoder config, build the vocoder's AudioProcessor,
    instantiate the model, restore the checkpoint weights in eval mode,
    and optionally move the model to the GPU.

    Args:
        model_file (str): path to the model checkpoint.
        model_config (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.
    """
    config = load_config(model_config)
    self.vocoder_config = config
    self.vocoder_ap = AudioProcessor(verbose=False, **config.audio)

    vocoder = setup_vocoder_model(config)
    vocoder.load_checkpoint(config, model_file, eval=True)
    if use_cuda:
        vocoder.cuda()
    self.vocoder_model = vocoder
def test_speaker_embedding():
    """End-to-end check of SpeakerManager d-vector computation."""
    # load config and enable resampling for the test audio
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder checkpoint on disk
    model = setup_model(config)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    # build the audio processor and the speaker manager around the dummy encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

    # embedding computed from a precomputed mel spectrogram
    waveform = ap.load_wav(sample_wav_path)
    d_vector = manager.compute_d_vector(ap.melspectrogram(waveform).T)
    assert d_vector.shape[1] == 256

    # embedding computed straight from a clip is deterministic
    first = torch.FloatTensor(manager.compute_d_vector_from_clip(sample_wav_path))
    second = torch.FloatTensor(manager.compute_d_vector_from_clip(sample_wav_path))
    assert first.shape[0] == 256
    assert (first - second).sum() == 0.0

    # averaging over a list of wav files yields a different embedding
    averaged = torch.FloatTensor(manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]))
    assert averaged.shape[0] == 256
    assert (first - averaged).sum() != 0.0

    # remove dummy model
    os.remove(encoder_model_path)
def main():
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    config = load_config(args.config_path)

    # collect the transcripts from both the train and eval splits
    train_items, eval_items = load_meta_data(config.datasets, eval_split=True)
    all_text = "".join(item[0] for item in train_items + eval_items)

    chars = set(all_text)
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
    """Initialize the TTS model from a checkpoint and its config.

    Steps: read the model config, validate the phonemizer setting, build
    the model, resolve the speaker-encoder paths when none were given,
    restore the checkpoint in eval mode, optionally move to GPU, and
    finally wire up the model's speaker manager with the encoder.

    Args:
        tts_checkpoint (str): path to the model checkpoint.
        tts_config_path (str): path to the model config file.
        use_cuda (bool): enable/disable CUDA use.

    Raises:
        ValueError: when phonemes are enabled but no phonemizer is set.
    """
    # pylint: disable=global-statement
    config = load_config(tts_config_path)
    self.tts_config = config
    if config["use_phonemes"] and config["phonemizer"] is None:
        raise ValueError("Phonemizer is not defined in the TTS config.")

    self.tts_model = setup_tts_model(config=config)

    # fall back to the encoder paths stored in the TTS config
    if not self.encoder_checkpoint:
        self._set_speaker_encoder_paths_from_tts_config()

    self.tts_model.load_checkpoint(config, tts_checkpoint, eval=True)
    if use_cuda:
        self.tts_model.cuda()

    if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
        self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config)
def main():
    # pylint: disable=W0601
    global c
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    # `c` stays global because the worker function reads it
    c = load_config(args.config_path)

    # gather samples from both splits
    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
    items = train_items + eval_items
    print("Num items:", len(items))

    # phonemize every transcript in parallel worker processes
    phoneme_lists = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)

    phones = {p for plist in phoneme_lists for p in plist}
    lower_phones = [p for p in phones if p.islower()]
    phones_force_lower = {p.lower() for p in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
default="",
# Target dataset name, used to pick a dataset formatter below.
help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
)
# NOTE(review): argparse `type=bool` does not parse "False"/"0" — any non-empty
# string is truthy, so `--use_cuda False` still yields True. Consider a
# store_true action or a str-to-bool converter; confirm intended CLI behavior.
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
parser.add_argument(
    "--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|"
)
args = parser.parse_args()

# load the audio config and build the processor from it
c = load_config(args.config_path)
ap = AudioProcessor(**c["audio"])

data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator

if args.target_dataset != "":
    # if target dataset is defined, load its wav list through the dataset loader
    dataset_config = [
        BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
    ]
    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # init trainer args
    train_args = TrainingArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args; leftover args are config overrides
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
        elif args.continue_path:
            # continue from a prev experiment: reuse its saved config.json
            config = load_config(os.path.join(args.continue_path, "config.json"))
            if len(config_overrides) > 0:
                config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args only
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init speaker manager: either learned speaker embeddings or external d-vectors
    if check_config_and_model_args(config, "use_speaker_embedding", True):
        speaker_manager = SpeakerManager(data_items=train_samples + eval_samples)
        # the speaker count lives on `model_args` when the config has one
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
        else:
            config.num_speakers = speaker_manager.num_speakers
    elif check_config_and_model_args(config, "use_d_vector_file", True):
        if check_config_and_model_args(config, "use_speaker_encoder_as_loss", True):
            # the encoder is also needed at train time for the speaker-consistency loss
            speaker_manager = SpeakerManager(
                d_vectors_file_path=config.model_args.d_vector_file,
                encoder_model_path=config.model_args.speaker_encoder_model_path,
                encoder_config_path=config.model_args.speaker_encoder_config_path,
                use_cuda=torch.cuda.is_available(),
            )
        else:
            speaker_manager = SpeakerManager(
                d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")
            )
        config.num_speakers = speaker_manager.num_speakers
        if hasattr(config, "model_args"):
            config.model_args.num_speakers = speaker_manager.num_speakers
    else:
        speaker_manager = None

    # init language manager for multilingual models
    if check_config_and_model_args(config, "use_language_embedding", True):
        language_manager = LanguageManager(config=config)
        if hasattr(config, "model_args"):
            config.model_args.num_languages = language_manager.num_languages
        else:
            config.num_languages = language_manager.num_languages
    else:
        language_manager = None

    # init the model from config
    model = setup_model(config, speaker_manager, language_manager)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
"--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None
)
# NOTE(review): argparse `type=bool` treats any non-empty string as True, so
# `--use_cuda False` / `--no_eval False` do NOT disable these flags — verify
# whether store_true actions were intended.
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda. Default False", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
args = parser.parse_args()

# load the dataset config and gather samples (eval split optional)
c_dataset = load_config(args.config_dataset_path)

meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
if meta_data_eval is None:
    wav_files = meta_data_train
else:
    wav_files = meta_data_train + meta_data_eval

# speaker encoder wrapper used to compute the d-vectors; `old_file` lets it
# skip audios that already have embeddings
encoder_manager = SpeakerManager(
    encoder_model_path=args.model_path,
    encoder_config_path=args.config_path,
    d_vectors_file_path=args.old_file,
    use_cuda=args.use_cuda,
)
def main():
    """Compute per-bin mean/std of mel and linear spectrograms over a dataset.

    Reads the audio settings from a TTS config, iterates the dataset wavs,
    accumulates first and second moments of the spectrogram features, and
    saves the resulting statistics (plus the audio config used) as a .npy
    file for later mean-variance normalization.

    Fixes over previous revision: user-facing typos corrected
    ("spectrogtram" -> "spectrogram", "processin" -> "processing",
    "lienar" -> "linear").
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    # running sums for mean/variance computation
    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0  # total number of frames seen
    for item in tqdm(dataset_items):
        # compute features; items from the meta data carry the wav path at index 1
        wav = ap.load_wav(item if isinstance(item, str) else item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # accumulate stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    # mean and std via E[x] and sqrt(E[x^2] - E[x]^2)
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values superseded by the computed stats
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
def download_model(self, model_name):
    """Download model files given the full model name.

    Model name is in the format
        'type/language/dataset/model'
        e.g. 'tts_model/en/ljspeech/tacotron'

    Every model must have the following files:
        - *.pth.tar : pytorch model checkpoint file.
        - config.json : model config file.
        - scale_stats.npy (if exist): scale values for preprocessing.

    Args:
        model_name (str): model name as explained above.

    Returns:
        Tuple of (checkpoint path, config path, model metadata dict).

    TODO: support multi-speaker models
    """
    # fetch model info from the dict
    model_type, lang, dataset, model = model_name.split("/")
    model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
    model_item = self.models_dict[model_type][lang][dataset][model]
    # set the model specific output path
    output_path = os.path.join(self.output_prefix, model_full_name)
    output_model_path = os.path.join(output_path, "model_file.pth.tar")
    output_config_path = os.path.join(output_path, "config.json")

    # NOTE : band-aid for removing phoneme support
    if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
        raise RuntimeError(
            " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
        )

    if os.path.exists(output_path):
        # model folder already present; skip the download entirely
        print(f" > {model_name} is already downloaded.")
    else:
        os.makedirs(output_path, exist_ok=True)
        print(f" > Downloading model to {output_path}")
        output_stats_path = os.path.join(output_path, "scale_stats.npy")
        output_speakers_path = os.path.join(output_path, "speakers.json")
        # download files to the output path
        if self._check_dict_key(model_item, "github_rls_url"):
            # download from github release: the zip contains all model files
            # TODO: pass output_path
            self._download_zip_file(model_item["github_rls_url"], output_path)
        else:
            # download from gdrive, file by file
            self._download_gdrive_file(model_item["model_file"], output_model_path)
            self._download_gdrive_file(model_item["config_file"], output_config_path)
            if self._check_dict_key(model_item, "stats_file"):
                self._download_gdrive_file(model_item["stats_file"], output_stats_path)

        # update the scale_path.npy file path in the model config.json
        if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
            # set scale stats path in config.json
            config_path = output_config_path
            config = load_config(config_path)
            config.audio.stats_path = output_stats_path
            config.save_json(config_path)
        # update the speakers.json file path in the model config.json to the current path
        if os.path.exists(output_speakers_path):
            # set scale stats path in config.json
            config_path = output_config_path
            config = load_config(config_path)
            config.external_speaker_embedding_file = output_speakers_path
            config.save_json(config_path)
    return output_model_path, output_config_path, model_item
def process_args(args, config=None):
    """Process parsed command line arguments and initialize the config if not provided.

    Args:
        args (argparse.Namespace or dict like): Parsed input arguments.
        config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.

    Returns:
        c (TTS.utils.io.AttrDict): Config parameters.
        out_path (str): Path to save models and logging.
        audio_path (str): Path to save generated test audios.
        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does logging to the console.
        dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging

    TODO:
        - Interactive config definition.
    """
    # args may arrive as (namespace, leftover-overrides) from parse_known_args
    if isinstance(args, tuple):
        args, coqpit_overrides = args
    if args.continue_path:
        # continue a previous training from its output folder, reusing its
        # config.json and the last/best checkpoints found there
        experiment_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
        if not args.best_path:
            args.best_path = best_model
    # init config if not already defined
    if config is None:
        if args.config_path:
            # init from a file
            config = load_config(args.config_path)
        else:
            # init from console args
            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

            config_base = BaseTrainingConfig()
            config_base.parse_known_args(coqpit_overrides)
            config = register_config(config_base.model)()
    # override values from command-line args
    config.parse_known_args(coqpit_overrides, relaxed_parser=True)
    experiment_path = args.continue_path
    if not experiment_path:
        # fresh run: create a new experiment folder under the configured output path
        experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
    audio_path = os.path.join(experiment_path, "test_audios")
    config.output_log_path = experiment_path
    # setup rank 0 process in distributed training: only rank 0 copies model
    # files and creates the dashboard logger
    dashboard_logger = None
    if args.rank == 0:
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        # if model characters are not set in the config file
        # save the default set to the config file for future
        # compatibility.
        if config.has("characters") and config.characters is None:
            used_characters = parse_symbols()
            new_fields["characters"] = used_characters
        copy_model_files(config, experiment_path, new_fields)
        dashboard_logger = logger_factory(config, experiment_path)
    c_logger = ConsoleLogger()
    return config, experiment_path, audio_path, c_logger, dashboard_logger