Example #1
0
    ref_level_db=20,
    preemphasis=0.0,
)

# This config is what gets saved for future use.
config = Tacotron2Config(
    # run outputs and data
    output_path=output_path,
    datasets=[dataset_config],
    audio=audio_config,
    # batching and loader workers
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # schedule, evaluation and logging
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    # model options
    r=2,
    ga_alpha=5.0,
    attention_type="dynamic_convolution",
    double_decoder_consistency=True,
    # text front end
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
)

# Build the audio processor from the audio settings carried by the config.
ap = AudioProcessor(**config.audio.to_dict())
Example #2
0
# This config is what gets saved for future use.
config = Tacotron2Config(
    # run outputs and data
    output_path=output_path,
    datasets=[dataset_config],
    audio=audio_config,
    # batching and loader workers
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # sequence-length handling
    sort_by_audio_len=True,
    min_seq_len=14800,
    # 44k is the original sampling rate before resampling; this corresponds to 10 seconds of audio
    max_seq_len=22050 * 10,
    # schedule, evaluation and logging
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=150,
    print_eval=False,
    mixed_precision=True,
    # model options
    r=2,
    # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
    double_decoder_consistency=True,
    attention_norm="softmax",
    use_speaker_embedding=True,  # set this to enable multi-speaker training
    # loss weights: the SSIM losses are disabled because they cause NaN for some runs
    decoder_ssim_alpha=0.0,
    postnet_ssim_alpha=0.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    # optimizer
    optimizer="Adam",
    lr_scheduler=None,
    lr=3e-5,
    # text front end
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
)
Example #3
0
    ref_level_db=20,
    preemphasis=0.0,
)

# This config is what gets saved for future use.
config = Tacotron2Config(
    # run outputs and data
    output_path=output_path,
    datasets=[dataset_config],
    audio=audio_config,
    # batching and loader workers
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # schedule, evaluation and logging
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    # model options
    r=6,
    gradual_training=[
        [0, 6, 64],
        [10000, 4, 32],
        [50000, 3, 32],
        [100000, 2, 32],
    ],
    double_decoder_consistency=True,
    # text front end
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
)

# Build the audio processor from the audio settings carried by the config.
ap = AudioProcessor(**config.audio.to_dict())
Example #4
0
from TTS.tts.configs.tacotron2_config import Tacotron2Config

# Where the serialized config and the training outputs of this test are written.
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

# Small single-epoch configuration with speaker embeddings enabled.
config = Tacotron2Config(
    # model options
    r=5,
    max_decoder_steps=50,
    use_speaker_embedding=True,
    num_speakers=4,
    # batching and loader workers
    batch_size=8,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    # schedule, evaluation and logging
    epochs=1,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=1,
    print_eval=True,
    test_sentences=["Be a voice, not an echo."],
    # text front end
    text_cleaner="english_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"),
)

# Adjust the audio settings on the built config, then write it to disk as JSON.
config.audio.trim_db = 60
config.audio.do_trim_silence = True
config.save_json(config_path)
Example #5
0
    def test_train_step():
        """Run ten Capacitron training steps and check that every parameter changed.

        Builds a random 8-sample batch, trains a ``Tacotron2`` model configured
        with the ``CapacitronOptimizer`` for ten iterations, and asserts that
        each parameter differs from a deep copy taken before training.
        """
        config = Tacotron2Config(
            num_chars=32,
            num_speakers=10,
            use_speaker_embedding=True,
            out_channels=80,
            decoder_output_dim=80,
            use_capacitron_vae=True,
            capacitron_vae=CapacitronVAEConfig(),
            optimizer="CapacitronOptimizer",
            optimizer_params={
                "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
                "SGD": {"lr": 1e-5, "momentum": 0.9},
            },
        )

        # NOTE: the order of the random draws below is kept stable so that a
        # seeded run produces the same batch.
        text_input = torch.randint(0, 24, (8, 128)).long().to(device)
        text_lengths = torch.randint(100, 129, (8,)).long().to(device)
        text_lengths = torch.sort(text_lengths, descending=True)[0]
        text_lengths[0] = 128  # longest sample spans the full text width
        mel_input = torch.rand(8, 120, config.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 120, (8,)).long().to(device)
        mel_lengths = torch.sort(mel_lengths, descending=True)[0]
        mel_lengths[0] = 120  # longest sample spans the full mel width
        stop_targets = torch.zeros(8, 120, 1).float().to(device)
        stop_target_lengths = torch.randint(0, 120, (8,)).to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)

        # Mark every frame at or past a sample's own length as a stop frame.
        # Each row is indexed individually so that the stop boundary of one
        # sample does not leak into the others.
        for row, mel_length in enumerate(mel_lengths):
            stop_targets[row, int(mel_length.item()):, 0] = 1.0

        # Group the frame-level targets into chunks of ``config.r`` frames and
        # flag a chunk as "stop" when any frame inside it is a stop frame.
        stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // config.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        batch = {
            "text_input": text_input,
            "text_lengths": text_lengths,
            "mel_input": mel_input,
            "mel_lengths": mel_lengths,
            "stop_targets": stop_targets,
            "stop_target_lengths": stop_target_lengths,
            "speaker_ids": speaker_ids,
            "d_vectors": None,
        }

        model = Tacotron2(config).to(device)
        criterion = model.get_criterion()
        optimizer = model.get_optimizer()

        model.train()
        model_ref = copy.deepcopy(model)
        # Sanity check: the reference copy starts out identical to the model.
        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param

        for _ in range(10):
            _, loss_dict = model.train_step(batch, criterion)
            optimizer.zero_grad()
            # Two backward passes per iteration: first the VAE beta loss
            # followed by ``first_step``, then the main loss followed by ``step``.
            loss_dict["capacitron_vae_beta_loss"].backward()
            optimizer.first_step()
            loss_dict["loss"].backward()
            optimizer.step()

        # check parameter changes
        for count, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())):
            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref
            )
Example #6
0
from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

# Seed torch's RNG and pick the device: first CUDA GPU when available, else CPU.
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Base config; the test class below works on copies of it.
config_global = Tacotron2Config(
    num_chars=32,
    num_speakers=5,
    out_channels=80,
    decoder_output_dim=80,
)

ap = AudioProcessor(**config_global.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


class TacotronTrainTest(unittest.TestCase):
    """Test vanilla Tacotron2 model."""
    def test_train_step(self):  # pylint: disable=no-self-use
        config = config_global.copy()
        config.use_speaker_embedding = False
        config.num_speakers = 1

        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
Example #7
0
import tensorflow as tf
import torch

from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

# Set the TensorFlow logger level to INFO.
tf.get_logger().setLevel("INFO")

# pylint: disable=unused-variable

# Seed torch's RNG and pick the device: first CUDA GPU when available, else CPU.
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Default Tacotron2 config; the dummy-input generator below reads its audio settings.
c = Tacotron2Config()


class TacotronTFTrainTest(unittest.TestCase):
    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
Example #8
0
# Training configuration for the Capacitron variant of Tacotron2.
config = Tacotron2Config(
    run_name="Capacitron-Tacotron2",
    audio=audio_config,
    capacitron_vae=capacitron_config,
    use_capacitron_vae=True,
    batch_size=128,  # Tune this to your gpu
    max_audio_len=8 * 22050,  # Tune this to your gpu
    min_audio_len=1 * 22050,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    precompute_num_workers=24,
    run_eval=True,
    test_delay_epochs=25,
    ga_alpha=0.0,
    r=2,
    optimizer="CapacitronOptimizer",
    optimizer_params={
        "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
        "SGD": {"lr": 1e-5, "momentum": 0.9},
    },
    attention_type="dynamic_convolution",
    # Important! We overwrite the standard grad_clip with capacitron_grad_clip
    grad_clip=0.0,
    double_decoder_consistency=False,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phonemizer="espeak",
    phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
    stopnet_pos_weight=15,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    sort_by_audio_len=True,
    seq_len_norm=True,
    output_path=output_path,
    datasets=[dataset_config],
    lr=1e-3,
    lr_scheduler="StepwiseGradualLR",
    lr_scheduler_params={
        # [step threshold, learning rate] pairs. The thresholds must be in
        # ascending order: an out-of-order entry (e.g. 4e5 between 2e4 and
        # 6e4) would leave its learning rate unused.
        "gradual_learning_rates": [
            [0, 1e-3],
            [2e4, 5e-4],
            [4e4, 3e-4],
            [6e4, 1e-4],
            [8e4, 5e-5],
        ]
    },
    scheduler_after_epoch=False,  # scheduler doesn't work without this flag
    # Need to experiment with these below for capacitron
    loss_masking=False,
    decoder_loss_alpha=1.0,
    postnet_loss_alpha=1.0,
    postnet_diff_spec_alpha=0.0,
    decoder_diff_spec_alpha=0.0,
    decoder_ssim_alpha=0.0,
    postnet_ssim_alpha=0.0,
)
Example #9
0
# This config is what gets saved for future use.
config = Tacotron2Config(
    # run outputs and data
    output_path=output_path,
    datasets=[dataset_config],
    audio=audio_config,
    # batching and loader workers
    batch_size=40,  # BS of 40 and max length of 10s will use about 20GB of GPU memory
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=8,
    # max audio length of 10 seconds, feel free to increase if you got more than 20GB GPU memory
    max_audio_len=22050 * 10,
    # schedule, evaluation and logging
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    # model options
    r=6,
    gradual_training=[
        [0, 6, 64],
        [10000, 4, 32],
        [50000, 3, 32],
        [100000, 2, 32],
    ],
    double_decoder_consistency=True,
    # text front end (German)
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
)