Example #1
 def test_unlock_act_norm_layers(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
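     # unlocking should reset the data-dependent init flag on the ActNorm flows,
     # which the loop below checks via `initialized`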
     model.unlock_act_norm_layers()
     for f in model.decoder.flows:
         if getattr(f, "set_ddi", False):
             self.assertFalse(f.initialized)
Example #2
 def _test_inference(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.eval()
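     # aux_input carries the true token lengths so padded positions are masked during inference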
     outputs = model.inference(input_dummy, {"x_lengths": input_lengths})
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #3
 def _test_forward_with_d_vector(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
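     # a random 256-dim vector stands in for a real speaker d-vector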
     d_vector = torch.rand(batch_size, 256).to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignments
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths,
                       {"d_vectors": d_vector})
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #4
 def _test_forward_with_speaker_id(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
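     # random integer speaker ids in [0, num_speakers)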
     speaker_ids = torch.randint(0, 24, (batch_size, )).long().to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_speaker_embedding=True,
         num_speakers=24,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignments
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths,
                       {"speaker_ids": speaker_ids})
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #5
 def test_test_run(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
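     # disable the data-dependent (DDI) initialization of the flow layers for this test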
     model.run_data_dep_init = False
     model.eval()
     test_figures, test_audios = model.test_run(None)
     self.assertTrue(test_figures is not None)
     self.assertTrue(test_audios is not None)
Example #6
 def _test_forward(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     # create model
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # forward pass: encoder and decoder with MAS alignments
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths)
     self.assertEqual(y["z"].shape, mel_spec.shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, mel_spec.shape)
     self.assertEqual(y["y_log_scale"].shape, mel_spec.shape)
     self.assertEqual(y["alignments"].shape,
                      mel_spec.shape[:2] + (input_dummy.shape[1], ))
     self.assertEqual(y["durations_log"].shape, input_dummy.shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape,
                      input_dummy.shape + (1, ))
Example #7
 def test_load_checkpoint(self):
     chkp_path = os.path.join(get_tests_output_path(),
                              "dummy_glow_tts_checkpoint.pth")
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     chkp = {}
     chkp["model"] = model.state_dict()
     torch.save(chkp, chkp_path)
     model.load_checkpoint(config, chkp_path)
     self.assertTrue(model.training)
     model.load_checkpoint(config, chkp_path, eval=True)
     self.assertFalse(model.training)
Example #8
    def test_inference():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
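        # make sure at least one item spans the full padded width of input_dummy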
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)

        # create model
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS(config).to(device)

        model.eval()
        print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

        # inference encoder and decoder with MAS
        y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths)

        y2 = model.decoder_inference(mel_spec, mel_lengths)
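        # decoder_inference should yield outputs with the same shape as the MAS-based inference above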

        assert (
            y2["model_outputs"].shape == y["model_outputs"].shape
        ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
            y["model_outputs"].shape, y2["model_outputs"].shape
        )
Example #9
    def test_init_from_config(self):
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)

        config = GlowTTSConfig(num_chars=32, num_speakers=2)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 2)
        self.assertTrue(not hasattr(model, "emb_g"))

        config = GlowTTSConfig(num_chars=32,
                               num_speakers=2,
                               use_speaker_embedding=True)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 2)
        self.assertTrue(hasattr(model, "emb_g"))

        config = GlowTTSConfig(
            num_chars=32,
            num_speakers=2,
            use_speaker_embedding=True,
            speakers_file=os.path.join(get_tests_data_path(), "ljspeech",
                                       "speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
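        # with a speakers_file given, the speaker count is read from speakers.json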
        self.assertTrue(model.num_speakers == 10)
        self.assertTrue(hasattr(model, "emb_g"))

        config = GlowTTSConfig(
            num_chars=32,
            use_d_vector_file=True,
            d_vector_dim=256,
            d_vector_file=os.path.join(get_tests_data_path(),
                                       "dummy_speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertTrue(model.num_speakers == 1)
        self.assertTrue(not hasattr(model, "emb_g"))
        self.assertTrue(model.c_in_channels == config.d_vector_dim)
Example #10
 def _test_inference_with_speaker_ids(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     speaker_ids = torch.randint(0, 24, (batch_size, )).long().to(device)
     # create model
     config = GlowTTSConfig(
         num_chars=32,
         use_speaker_embedding=True,
         num_speakers=24,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
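     # inference conditions on the internal speaker embedding selected by speaker_ids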
     outputs = model.inference(input_dummy, {
         "x_lengths": input_lengths,
         "speaker_ids": speaker_ids
     })
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #11
    def test_inference():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        # create model
        model = GlowTTS(
            num_chars=32,
            hidden_channels_enc=48,
            hidden_channels_dec=48,
            hidden_channels_dp=32,
            out_channels=80,
            encoder_type="rel_pos_transformer",
            encoder_params={
                "kernel_size": 3,
                "dropout_p": 0.1,
                "num_layers": 6,
                "num_heads": 2,
                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
                "input_length": None,
            },
            use_encoder_prenet=True,
            num_flow_blocks_dec=12,
            kernel_size_dec=5,
            dilation_rate=1,
            num_block_layers=4,
            dropout_p_dec=0.0,
            num_speakers=0,
            c_in_channels=0,
            num_splits=4,
            num_squeeze=1,
            sigmoid_scale=False,
            mean_only=False,
        ).to(device)

        model.eval()
        print(" > Num parameters for GlowTTS model:%s" %
              (count_parameters(model)))

        # inference encoder and decoder with MAS
        y, *_ = model.inference_with_MAS(input_dummy, input_lengths, mel_spec,
                                         mel_lengths, None)

        y_dec, _ = model.decoder_inference(mel_spec, mel_lengths)

        assert (
            y_dec.shape == y.shape
        ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
            y.shape, y_dec.shape)
Example #12
 def _test_inference_with_d_vector(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     d_vector = torch.rand(batch_size, 256).to(device)
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.eval()
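     # the external d-vector is passed through aux_input instead of a speaker id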
     outputs = model.inference(input_dummy, {
         "x_lengths": input_lengths,
         "d_vectors": d_vector
     })
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Example #13
 def _test_inference_with_MAS(self, batch_size):
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     # create model
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     model.eval()
     # inference encoder and decoder with MAS
     y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec,
                                  mel_lengths)
     y2 = model.decoder_inference(mel_spec, mel_lengths)
     assert (
         y2["model_outputs"].shape == y["model_outputs"].shape
     ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format(
         y["model_outputs"].shape, y2["model_outputs"].shape)
Example #14
 def test_train_eval_log(self):
     batch_size = BATCH_SIZE
     input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(
         batch_size)
     batch = {}
     batch["text_input"] = input_dummy
     batch["text_lengths"] = input_lengths
     batch["mel_lengths"] = mel_lengths
     batch["mel_input"] = mel_spec
     batch["d_vectors"] = None
     batch["speaker_ids"] = None
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.run_data_dep_init = False
     model.train()
     logger = TensorboardLogger(log_dir=os.path.join(
         get_tests_output_path(), "dummy_glow_tts_logs"),
                                model_name="glow_tts_test_train_log")
     criterion = model.get_criterion()
     outputs, _ = model.train_step(batch, criterion)
     model.train_log(batch, outputs, logger, None, 1)
     model.eval_log(batch, outputs, logger, None, 1)
     logger.finish()
Example #15
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples,
                                  parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(TrainerArgs(),
                  config,
                  output_path,
                  model=model,
                  train_samples=train_samples,
                  eval_samples=eval_samples)

# AND... 3,2,1... 🚀
trainer.fit()
Example #16
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor(**config.audio.to_dict())

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = GlowTTS(config, speaker_manager=None)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},  # assets are objects used by the models but not class members.
)

# AND... 3,2,1... 🚀
trainer.fit()
Example #17
    def test_train_step():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)

        criterion = GlowTTSLoss()

        # model to train
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS(config).to(device)

        # reference model to compare model weights
        model_ref = GlowTTS(config).to(device)

        model.train()
        print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))

        # pass the state to ref model
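        # (so that parameter changes after the optimizer steps can be detected below)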
        model_ref.load_state_dict(copy.deepcopy(model.state_dict()))

        count = 0
        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        for _ in range(5):
            optimizer.zero_grad()
            outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, None)
            loss_dict = criterion(
                outputs["z"],
                outputs["y_mean"],
                outputs["y_log_scale"],
                outputs["logdet"],
                mel_lengths,
                outputs["durations_log"],
                outputs["total_durations_log"],
                input_lengths,
            )
            loss = loss_dict["loss"]
            loss.backward()
            optimizer.step()

        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref
            )
            count += 1
Example #18
    def test_train_step():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        criterion = GlowTTSLoss()

        # model to train
        model = GlowTTS(
            num_chars=32,
            hidden_channels_enc=48,
            hidden_channels_dec=48,
            hidden_channels_dp=32,
            out_channels=80,
            encoder_type="rel_pos_transformer",
            encoder_params={
                "kernel_size": 3,
                "dropout_p": 0.1,
                "num_layers": 6,
                "num_heads": 2,
                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
                "input_length": None,
            },
            use_encoder_prenet=True,
            num_flow_blocks_dec=12,
            kernel_size_dec=5,
            dilation_rate=1,
            num_block_layers=4,
            dropout_p_dec=0.0,
            num_speakers=0,
            c_in_channels=0,
            num_splits=4,
            num_squeeze=1,
            sigmoid_scale=False,
            mean_only=False,
        ).to(device)

        # reference model to compare model weights
        model_ref = GlowTTS(
            num_chars=32,
            hidden_channels_enc=48,
            hidden_channels_dec=48,
            hidden_channels_dp=32,
            out_channels=80,
            encoder_type="rel_pos_transformer",
            encoder_params={
                "kernel_size": 3,
                "dropout_p": 0.1,
                "num_layers": 6,
                "num_heads": 2,
                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
                "input_length": None,
            },
            use_encoder_prenet=True,
            num_flow_blocks_dec=12,
            kernel_size_dec=5,
            dilation_rate=1,
            num_block_layers=4,
            dropout_p_dec=0.0,
            num_speakers=0,
            c_in_channels=0,
            num_splits=4,
            num_squeeze=1,
            sigmoid_scale=False,
            mean_only=False,
        ).to(device)

        model.train()
        print(" > Num parameters for GlowTTS model:%s" %
              (count_parameters(model)))

        # pass the state to ref model
        model_ref.load_state_dict(copy.deepcopy(model.state_dict()))

        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        for _ in range(5):
            optimizer.zero_grad()
            z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
                input_dummy, input_lengths, mel_spec, mel_lengths, None)
            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
                                  o_dur_log, o_total_dur, input_lengths)
            loss = loss_dict["loss"]
            loss.backward()
            optimizer.step()

        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
            count += 1
Example #19
 def test_init_multispeaker(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config)
     # speaker embedding with default speaker_embedding_dim
     config.use_speaker_embedding = True
     config.num_speakers = 5
     config.d_vector_dim = None
     model.init_multispeaker(config)
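     # with an internal speaker embedding, the conditioning channels match the encoder hidden size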
     self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
     # use external speaker embeddings with speaker_embedding_dim = 301
     config = GlowTTSConfig(num_chars=32)
     config.use_d_vector_file = True
     config.d_vector_dim = 301
     model = GlowTTS(config)
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, 301)
     # use speaker embeddings from the provided speaker_manager
     config = GlowTTSConfig(num_chars=32)
     config.use_speaker_embedding = True
     config.speakers_file = os.path.join(get_tests_data_path(), "ljspeech",
                                         "speakers.json")
     speaker_manager = SpeakerManager.init_from_config(config)
     model = GlowTTS(config)
     model.speaker_manager = speaker_manager
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, model.hidden_channels_enc)
     self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
     # use external speaker embeddings by the provided speaker_manager
     config = GlowTTSConfig(num_chars=32)
     config.use_d_vector_file = True
     config.d_vector_dim = 256
     config.d_vector_file = os.path.join(get_tests_data_path(),
                                         "dummy_speakers.json")
     speaker_manager = SpeakerManager.init_from_config(config)
     model = GlowTTS(config)
     model.speaker_manager = speaker_manager
     model.init_multispeaker(config)
     self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim)
     self.assertEqual(model.num_speakers, speaker_manager.num_speakers)
Example #20
 def test_get_criterion(self):
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     criterion = model.get_criterion()
     self.assertTrue(criterion is not None)
Example #21
 def test_train_step(self):
     batch_size = BATCH_SIZE
     input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids = self._create_inputs(
         batch_size)
     criterion = GlowTTSLoss()
     # model to train
     config = GlowTTSConfig(num_chars=32)
     model = GlowTTS(config).to(device)
     # reference model to compare model weights
     model_ref = GlowTTS(config).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))
     # pass the state to ref model
     model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
     count = 0
     for param, param_ref in zip(model.parameters(),
                                 model_ref.parameters()):
         assert (param - param_ref).sum() == 0, param
         count += 1
     optimizer = optim.Adam(model.parameters(), lr=0.001)
     for _ in range(5):
         optimizer.zero_grad()
         outputs = model.forward(input_dummy, input_lengths, mel_spec,
                                 mel_lengths, None)
         loss_dict = criterion(
             outputs["z"],
             outputs["y_mean"],
             outputs["y_log_scale"],
             outputs["logdet"],
             mel_lengths,
             outputs["durations_log"],
             outputs["total_durations_log"],
             input_lengths,
         )
         loss = loss_dict["loss"]
         loss.backward()
         optimizer.step()
     # check parameter changes
     self._check_parameter_changes(model, model_ref)