    ref_level_db=20,
    preemphasis=0.0,
)

config = Tacotron2Config(  # This is the config that is saved for future use
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    ga_alpha=5.0,
    r=2,
    attention_type="dynamic_convolution",
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
config = Tacotron2Config(  # This is the config that is saved for future use
    audio=audio_config,
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    r=2,
    # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=150,
    print_eval=False,
    mixed_precision=True,
    sort_by_audio_len=True,
    min_seq_len=14800,
    max_seq_len=22050 * 10,  # 44k is the original sampling rate before resampling; this corresponds to 10 seconds of audio
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    decoder_ssim_alpha=0.0,  # disable SSIM losses that cause NaN for some runs
    postnet_ssim_alpha=0.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    attention_norm="softmax",
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
)
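# --- A hedged sketch, not part of the recipe above ---
# With use_speaker_embedding=True the model needs to know how many speakers the dataset
# contains. One way to derive that count from the loaded samples is sketched below.
# Assumption: load_tts_samples() returns sample dicts carrying a "speaker_name" key
# (true for recent Coqui TTS releases; older versions returned plain lists instead).
from TTS.tts.datasets import load_tts_samples

train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
speaker_names = {sample["speaker_name"] for sample in train_samples + eval_samples}
config.num_speakers = len(speaker_names)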
    ref_level_db=20,
    preemphasis=0.0,
)

config = Tacotron2Config(  # This is the config that is saved for future use
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    r=6,
    gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
import os

from tests import get_tests_output_path
from TTS.tts.configs.tacotron2_config import Tacotron2Config

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = Tacotron2Config(
    r=5,
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"),
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    test_sentences=[
        "Be a voice, not an echo.",
    ],
    use_speaker_embedding=True,
    num_speakers=4,
    max_decoder_steps=50,
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)
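# --- A hedged sketch, not part of the original test ---
# The JSON written by save_json() above can be restored into a fresh config object.
# Coqpit-based configs expose load_json(), which updates the instance in place from the
# file; the assertion is only an illustrative sanity check, not taken from the test.
restored = Tacotron2Config()
restored.load_json(config_path)
assert restored.r == config.r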
def test_train_step():
    config = Tacotron2Config(
        num_chars=32,
        num_speakers=10,
        use_speaker_embedding=True,
        out_channels=80,
        decoder_output_dim=80,
        use_capacitron_vae=True,
        capacitron_vae=CapacitronVAEConfig(),
        optimizer="CapacitronOptimizer",
        optimizer_params={
            "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
            "SGD": {"lr": 1e-5, "momentum": 0.9},
        },
    )

    # build a dummy training batch
    batch = dict({})
    batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
    batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
    batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
    batch["text_lengths"][0] = 128
    batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
    batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
    batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
    batch["mel_lengths"][0] = 120
    batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
    batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
    batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
    batch["d_vectors"] = None

    for idx in batch["mel_lengths"]:
        batch["stop_targets"][:, int(idx.item()):, 0] = 1.0

    batch["stop_targets"] = batch["stop_targets"].view(
        batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
    )
    batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()

    model = Tacotron2(config).to(device)
    criterion = model.get_criterion()
    optimizer = model.get_optimizer()
    model.train()
    model_ref = copy.deepcopy(model)
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        assert (param - param_ref).sum() == 0, param
        count += 1

    # CapacitronOptimizer updates in two phases: the VAE beta loss is stepped first,
    # then the main loss.
    for _ in range(10):
        _, loss_dict = model.train_step(batch, criterion)
        optimizer.zero_grad()
        loss_dict["capacitron_vae_beta_loss"].backward()
        optimizer.first_step()
        loss_dict["loss"].backward()
        optimizer.step()

    # check parameter changes
    count = 0
    for param, param_ref in zip(model.parameters(), model_ref.parameters()):
        # ignore pre-highway layer since it works conditionally
        assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
            count, param.shape, param, param_ref
        )
        count += 1
import copy
import os
import unittest

import torch

from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80)

ap = AudioProcessor(**config_global.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


class TacotronTrainTest(unittest.TestCase):
    """Test vanilla Tacotron2 model."""

    def test_train_step(self):  # pylint: disable=no-self-use
        config = config_global.copy()
        config.use_speaker_embedding = False
        config.num_speakers = 1

        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
import unittest

import tensorflow as tf
import torch

from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

tf.get_logger().setLevel("INFO")

# pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = Tacotron2Config()


class TacotronTFTrainTest(unittest.TestCase):
    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8,)).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
config = Tacotron2Config(
    run_name="Capacitron-Tacotron2",
    audio=audio_config,
    capacitron_vae=capacitron_config,
    use_capacitron_vae=True,
    batch_size=128,  # Tune this to your gpu
    max_audio_len=8 * 22050,  # Tune this to your gpu
    min_audio_len=1 * 22050,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    precompute_num_workers=24,
    run_eval=True,
    test_delay_epochs=25,
    ga_alpha=0.0,
    r=2,
    optimizer="CapacitronOptimizer",
    optimizer_params={
        "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
        "SGD": {"lr": 1e-5, "momentum": 0.9},
    },
    attention_type="dynamic_convolution",
    grad_clip=0.0,  # Important! We overwrite the standard grad_clip with capacitron_grad_clip
    double_decoder_consistency=False,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phonemizer="espeak",
    phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
    stopnet_pos_weight=15,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    sort_by_audio_len=True,
    seq_len_norm=True,
    output_path=output_path,
    datasets=[dataset_config],
    lr=1e-3,
    lr_scheduler="StepwiseGradualLR",
    lr_scheduler_params={
        "gradual_learning_rates": [
            [0, 1e-3],
            [2e4, 5e-4],
            [4e5, 3e-4],
            [6e4, 1e-4],
            [8e4, 5e-5],
        ]
    },
    scheduler_after_epoch=False,  # scheduler doesn't work without this flag
    # Need to experiment with these below for capacitron
    loss_masking=False,
    decoder_loss_alpha=1.0,
    postnet_loss_alpha=1.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    decoder_ssim_alpha=0.0,
    postnet_ssim_alpha=0.0,
)
config = Tacotron2Config(  # This is the config that is saved for future use
    audio=audio_config,
    batch_size=40,  # BS of 40 and max length of 10s will use about 20GB of GPU memory
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    r=6,
    gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    precompute_num_workers=8,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
    # max audio length of 10 seconds, feel free to increase if you have more than 20GB of GPU memory
    max_audio_len=22050 * 10,
    output_path=output_path,
    datasets=[dataset_config],
)
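# --- A hedged sketch, not part of the recipe above ---
# How a config like this is typically handed to the trainer in recent Coqui TTS recipes:
# build the audio processor and tokenizer from the config, load the dataset samples,
# initialize the model, and let the Trainer run. Exact constructor signatures vary
# between TTS versions, so treat every call below as an assumption to verify.
from trainer import Trainer, TrainerArgs

from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()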