# Fragment of a Tacotron2 script for the VCTK dataset: imports Tacotron2, SpeakerManager and
# AudioProcessor, builds `dataset_config` and `audio_config` (22050 Hz, resample disabled,
# silence trimming at 23 dB), then opens a `Tacotron2Config(...)` call.
# NOTE(review): several statements are collapsed onto this single line, so the inline `#`
# comments swallow the code that follows them, and the `Tacotron2Config(` call is cut off
# before its closing parenthesis. Restore the line breaks and the rest of the call before use.
# NOTE(review): `os`, `BaseDatasetConfig`, `BaseAudioConfig` and `Tacotron2Config` are not
# imported in the visible text — presumably imported in the missing part; confirm.
from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) audio_config = BaseAudioConfig( sample_rate=22050, resample= False, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this False for faster training. do_trim_silence=True, trim_db=23.0, signal_norm=False, mel_fmin=0.0, mel_fmax=8000, spec_gain=1.0, log_func="np.log", preemphasis=0.0, ) config = Tacotron2Config( # This is the config that is saved for the future use audio=audio_config, batch_size=32, eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, run_eval=True, test_delay_epochs=-1,
# Fragment of a GlowTTS script for the VCTK dataset: downloads VCTK via `download_vctk` when
# `dataset_path` does not exist, builds `dataset_config` and `audio_config` (22050 Hz,
# resample enabled, silence trimming at 23 dB), then opens a `GlowTTSConfig(...)` call with
# phoneme-based text processing ("en-us").
# NOTE(review): the statements are collapsed onto one line that begins with `#`, so as written
# Python treats the entire line as a comment. The `GlowTTSConfig(` call is also cut off before
# its closing parenthesis. Restore the line breaks and the rest of the call before use.
# NOTE(review): `dataset_path`, `os` and the config classes are not defined in the visible
# text — presumably defined in the missing part; confirm.
# download the dataset if not downloaded if not os.path.exists(dataset_path): from TTS.utils.downloaders import download_vctk download_vctk(dataset_path) # define dataset config dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path) # define audio config # ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training audio_config = BaseAudioConfig(sample_rate=22050, resample=True, do_trim_silence=True, trim_db=23.0) # define model config config = GlowTTSConfig( batch_size=64, eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, precompute_num_workers=4, run_eval=True, test_delay_epochs=-1, epochs=1000, text_cleaner="phoneme_cleaners", use_phonemes=True, phoneme_language="en-us",
# Fragment of a Tacotron2 script for LJSpeech: builds `dataset_config` (metadata.csv under
# ../LJSpeech-1.1/ relative to this file) and `audio_config` (22050 Hz, silence trimming at
# 60 dB, no signal normalization), then opens a `Tacotron2Config(...)` call.
# NOTE(review): the statements are collapsed onto one line that begins with a commented-out
# Tokenizer import, so as written Python treats the entire line as a comment. The
# `Tacotron2Config(` call is also cut off before its closing parenthesis. Restore the line
# breaks and the rest of the call before use.
# NOTE(review): `os` and the config classes are not imported in the visible text — confirm.
# from TTS.tts.datasets.tokenizer import Tokenizer output_path = os.path.dirname(os.path.abspath(__file__)) # init configs dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) audio_config = BaseAudioConfig( sample_rate=22050, do_trim_silence=True, trim_db=60.0, signal_norm=False, mel_fmin=0.0, mel_fmax=8000, spec_gain=1.0, log_func="np.log", ref_level_db=20, preemphasis=0.0, ) config = Tacotron2Config( # This is the config that is saved for the future use audio=audio_config, batch_size=64, eval_batch_size=16, num_loader_workers=4, num_eval_loader_workers=4, run_eval=True, test_delay_epochs=-1, ga_alpha=5.0,
# Fragment of a VITS script for LJSpeech: imports the config and trainer helpers, builds
# `dataset_config` (metadata.csv under ../LJSpeech-1.1/ relative to this file) and
# `audio_config` (22050 Hz, win_length 1024, hop_length 256, 80 mel bands, silence trimming
# at 45 dB), then opens a `VitsConfig(...)` call with run_name "vits_ljspeech".
# NOTE(review): several statements are collapsed onto this single line and the `VitsConfig(`
# call is cut off before its closing parenthesis. Restore the line breaks and the rest of the
# call before use.
# NOTE(review): `os` is not imported in the visible text — presumably imported in the missing
# part; confirm.
from TTS.config.shared_configs import BaseAudioConfig from TTS.trainer import Trainer, TrainingArgs, init_training from TTS.tts.configs import BaseDatasetConfig, VitsConfig output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") ) audio_config = BaseAudioConfig( sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, preemphasis=0.0, ref_level_db=20, log_func="np.log", do_trim_silence=True, trim_db=45, mel_fmin=0, mel_fmax=None, spec_gain=1.0, signal_norm=False, do_amp_to_db_linear=False, ) config = VitsConfig( audio=audio_config, run_name="vits_ljspeech", batch_size=48, eval_batch_size=16, batch_group_size=5, num_loader_workers=4, num_eval_loader_workers=4,
from TTS.config.shared_configs import BaseAudioConfig
from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig

# Both artifacts live under the tests output directory: the serialized config that the
# training script reads, and the folder that receives the training outputs.
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

# Deliberately tiny speaker-encoder setup: small batches, no loader workers, and a hard stop
# after two training steps, with printing and saving on every step.
config = SpeakerEncoderConfig(
    batch_size=4,
    num_speakers_in_batch=1,
    num_utters_per_speaker=10,
    num_loader_workers=0,
    max_train_step=2,
    print_step=1,
    save_step=1,
    print_eval=True,
    audio=BaseAudioConfig(num_mels=40),
)

# Silence trimming is switched on after construction, then the config is written to disk.
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)

# Launch the encoder training script through the CLI, pointing every dataset field at the
# bundled LJSpeech test data. The tokens are joined with single spaces and a trailing space
# is appended so the resulting command string is unchanged.
command_train = " ".join(
    (
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}'",
        "python TTS/bin/train_encoder.py",
        f"--config_path {config_path}",
        f"--coqpit.output_path {output_path}",
        "--coqpit.datasets.0.name ljspeech",
        "--coqpit.datasets.0.meta_file_train metadata.csv",
        "--coqpit.datasets.0.meta_file_val metadata.csv",
        "--coqpit.datasets.0.path tests/data/ljspeech",
    )
) + " "
run_cli(command_train)
# Fragment of a Capacitron Tacotron script for the Blizzard dataset: reads data from the
# hard-coded "/srv/data/" path using the "ljspeech" dataset name, builds `audio_config`
# (24000 Hz, signal normalization on, log10 spectrograms, mel range 80–12000) and a
# `CapacitronVAEConfig` with loss alpha 1.0, then opens a `TacotronConfig(...)` call with
# run_name "Blizzard-Capacitron-T1".
# NOTE(review): several statements are collapsed onto this single line, so the inline `#`
# comments swallow the code that follows them, and the `TacotronConfig(` call is cut off
# before its closing parenthesis. Restore the line breaks and the rest of the call before use.
# NOTE(review): `os` and the config classes are not imported in the visible text — confirm.
output_path = os.path.dirname(os.path.abspath(__file__)) data_path = "/srv/data/" # Using LJSpeech like dataset processing for the blizzard dataset dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path) audio_config = BaseAudioConfig( sample_rate=24000, do_trim_silence=True, trim_db=60.0, signal_norm=True, mel_fmin=80.0, mel_fmax=12000, spec_gain=20.0, log_func="np.log10", ref_level_db=20, preemphasis=0.0, min_level_db=-100, ) # Using the standard Capacitron config capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0) config = TacotronConfig( run_name="Blizzard-Capacitron-T1", audio=audio_config, capacitron_vae=capacitron_config, use_capacitron_vae=True,