Example #1
 def _test_forward_with_d_vector(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     d_vector = torch.rand(batch_size, 256).to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignment
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths,
                       {"d_vectors": d_vector})
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #2
 def test_unlock_act_norm_layers(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.unlock_act_norm_layers()
     for f in model.decoder.flows:
         if getattr(f, "set_ddi", False):
             self.assertFalse(f.initialized)
Example #3
 def _test_forward_with_speaker_id(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     speaker_ids = torch.randint(0, 24, (batch_size, )).long().to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_speaker_embedding=True,
         num_speakers=24,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignment
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths,
                       {"speaker_ids": speaker_ids})
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #4
    def test_inference():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)

        # create model
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS(config).to(device)

        model.eval()
        print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

        # inference encoder and decoder with MAS
        y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths)

        y2 = model.decoder_inference(mel_spec, mel_lengths)

        assert (
            y2["model_outputs"].shape == y["model_outputs"].shape
        ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
            y["model_outputs"].shape, y2["model_outputs"].shape
        )
Example #5
 def _test_inference(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.eval()
     outputs = model.inference(input_dummy, {"x_lengths": input_lengths})
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #6
 def test_test_run(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.run_data_dep_init = False
     model.eval()
     test_figures, test_audios = model.test_run(None)
     self.assertTrue(test_figures is not None)
     self.assertTrue(test_audios is not None)
Example #7
 def test_load_checkpoint(self):
     chkp_path = os.path.join(get_tests_output_path(),
                              "dummy_glow_tts_checkpoint.pth")
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     chkp = {}
     chkp["model"] = model.state_dict()
     torch.save(chkp, chkp_path)
     model.load_checkpoint(config, chkp_path)
     self.assertTrue(model.training)
     model.load_checkpoint(config, chkp_path, eval=True)
     self.assertFalse(model.training)
Example #8
    def test_train_step():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)

        criterion = GlowTTSLoss()

        # model to train
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS(config).to(device)

        # reference model to compare model weights
        model_ref = GlowTTS(config).to(device)

        model.train()
        print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

        # pass the state to ref model
        model_ref.load_state_dict(copy.deepcopy(model.state_dict()))

        count = 0
        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        for _ in range(5):
            optimizer.zero_grad()
            outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, None)
            loss_dict = criterion(
                outputs["z"],
                outputs["y_mean"],
                outputs["y_log_scale"],
                outputs["logdet"],
                mel_lengths,
                outputs["durations_log"],
                outputs["total_durations_log"],
                input_lengths,
            )
            loss = loss_dict["loss"]
            loss.backward()
            optimizer.step()

        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref
            )
            count += 1
Example #9
    def test_init_from_config(self):
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)

        config = GlowTTSConfig(num_chars=32, num_speakers=2)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 2)
        self.assertTrue(not hasattr(model, "emb_g"))

        config = GlowTTSConfig(num_chars=32,
                               num_speakers=2,
                               use_speaker_embedding=True)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 2)
        self.assertTrue(hasattr(model, "emb_g"))

        config = GlowTTSConfig(
            num_chars=32,
            num_speakers=2,
            use_speaker_embedding=True,
            speakers_file=os.path.join(get_tests_data_path(), "ljspeech",
                                       "speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 10)
        self.assertTrue(hasattr(model, "emb_g"))

        config = GlowTTSConfig(
            num_chars=32,
            use_d_vector_file=True,
            d_vector_dim=256,
            d_vector_file=os.path.join(get_tests_data_path(),
                                       "dummy_speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 1)
        self.assertTrue(not hasattr(model, "emb_g"))
        self.assertTrue(model.c_in_channels == config.d_vector_dim)
Example #10
 def _test_inference_with_MAS(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     # create model
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.eval()
     # inference encoder and decoder with MAS
     y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec,
                                  mel_lengths)
     y2 = model.decoder_inference(mel_spec, mel_lengths)
     assert (
         y2["model_outputs"].shape == y["model_outputs"].shape
     ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
         y["model_outputs"].shape, y2["model_outputs"].shape)
Example #11
 def _test_inference_with_speaker_ids(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     speaker_ids = torch.randint(0, 24, (batch_size, )).long().to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_speaker_embedding=True,
         num_speakers=24,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     outputs = model.inference(input_dummy, {
         "x_lengths": input_lengths,
         "speaker_ids": speaker_ids
     })
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #12
 def _test_inference_with_d_vector(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     d_vector = torch.rand(batch_size, 256).to(device)
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.eval()
     outputs = model.inference(input_dummy, {
         "x_lengths": input_lengths,
         "d_vectors": d_vector
     })
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #13
 def _test_forward(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     # create model
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignment
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths)
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #14
 def test_train_step(self):
     batch_size = BATCH_SIZE
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     criterion = GlowTTSLoss()
     # model to train
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     # reference model to compare model weights
     model_ref = GlowTTS(config).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # pass the state to ref model
     model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
     count = 0
     for param, param_ref in zip(model.parameters(),
                                 model_ref.parameters()):
         assert (param - param_ref).sum() == 0, param
         count += 1
     optimizer = optim.Adam(model.parameters(), lr=0.001)
     for _ in range(5):
         optimizer.zero_grad()
         outputs = model.forward(input_dummy, input_lengths, mel_spec,
                                 mel_lengths, None)
         loss_dict = criterion(
             outputs["z"],
             outputs["y_mean"],
             outputs["y_log_scale"],
             outputs["logdet"],
             mel_lengths,
             outputs["durations_log"],
             outputs["total_durations_log"],
             input_lengths,
         )
         loss = loss_dict["loss"]
         loss.backward()
         optimizer.step()
     # check parameter changes
     self._check_parameter_changes(model, model_ref)
Example #15
 def test_train_eval_log(self):
     batch_size = BATCH_SIZE
     input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(
         batch_size)
     batch = {}
     batch["text_input"] = input_dummy
     batch["text_lengths"] = input_lengths
     batch["mel_lengths"] = mel_lengths
     batch["mel_input"] = mel_spec
     batch["d_vectors"] = None
     batch["speaker_ids"] = None
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.run_data_dep_init = False
     model.train()
     logger = TensorboardLogger(log_dir=os.path.join(
         get_tests_output_path(), "dummy_glow_tts_logs"),
                                model_name="glow_tts_test_train_log")
     criterion = model.get_criterion()
     outputs, _ = model.train_step(batch, criterion)
     model.train_log(batch, outputs, logger, None, 1)
     model.eval_log(batch, outputs, logger, None, 1)
     logger.finish()
Example #16
import copy
import os
import unittest

import torch
from torch import optim

from tests import get_tests_input_path
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.layers.losses import GlowTTSLoss
from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = GlowTTSConfig()

ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


def count_parameters(model):
    r"""Count number of trainable parameters in a network"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


class GlowTTSTrainTest(unittest.TestCase):
    @staticmethod
    def test_train_step():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
Example #17
# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
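# A minimal sketch of the step introduced by the comment above, mirroring the
# call shown in Example #20 (it assumes AudioProcessor has been imported from
# TTS.utils.audio, which this truncated snippet does not show):
ap = AudioProcessor(**config.audio.to_dict())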
Example #18
import os

from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")

config = GlowTTSConfig(
    batch_size=2,
    eval_batch_size=8,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    test_sentences=[
        "Be a voice, not an echo.",
    ],
    data_dep_init_steps=1.0,
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)

# train the model for one epoch
command_train = (
Example #19
 def test_init_multispeaker(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config)
     # speaker embedding with default speaker_embedding_dim
     config.use_speaker_embedding = True
     config.num_speakers = 5
     config.d_vector_dim = None
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
     # use external speaker embeddings with speaker_embedding_dim = 301
     config = GlowTTSConfig(num_chars=32)
     config.use_d_vector_file = True
     config.d_vector_dim = 301
     model = GlowTTS(config)
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, 301)
     # use speaker embeddings from the provided speaker_manager
     config = GlowTTSConfig(num_chars=32)
     config.use_speaker_embedding = True
     config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech",
                                         "speakers.json")
     speaker_manager = SpeakerManager.init_from_config(config)
     model = GlowTTS(config)
     model.speaker_manager = speaker_manager
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
     self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
     # use external speaker embeddings by the provided speaker_manager
     config = GlowTTSConfig(num_chars=32)
     config.use_d_vector_file = True
     config.d_vector_dim = 256
     config.d_vector_file = os.path.join(get_tests_data_path(),
                                         "dummy_speakers.json")
     speaker_manager = SpeakerManager.init_from_config(config)
     model = GlowTTS(config)
     model.speaker_manager = speaker_manager
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim)
     self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
Example #20
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
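# As noted above, a plain dict can stand in for BaseDatasetConfig when a custom
# formatter consumes it. A hedged sketch with the same fields (the exact keys
# your formatter expects depend on how that formatter is written):
# dataset_config = {
#     "name": "ljspeech",
#     "meta_file_train": "metadata.csv",
#     "path": os.path.join(output_path, "../LJSpeech-1.1/"),
# }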

# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor(**config.audio.to_dict())

# LOAD DATA SAMPLES
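# A hedged sketch of what typically follows this comment, assuming the
# load_tts_samples helper from TTS.tts.datasets used in the Coqui TTS recipes
# (not part of the original, truncated snippet):
from TTS.tts.datasets import load_tts_samples

train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)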
Example #21
                               do_trim_silence=True,
                               trim_db=23.0)

# define model config
config = GlowTTSConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=0,
    max_audio_len=500000,
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
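# A hedged sketch of how a config like this is usually wired into training,
# assuming the standalone `trainer` package used by recent Coqui TTS recipes;
# train_samples / eval_samples are assumed to come from a prior data-loading
# step and are not part of the original, truncated snippet:
from trainer import Trainer, TrainerArgs

from TTS.tts.models.glow_tts import GlowTTS
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor(**config.audio.to_dict())
model = GlowTTS.init_from_config(config)
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()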
Example #22
 def test_get_criterion(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     criterion = model.get_criterion()
     self.assertTrue(criterion is not None)