Beispiel #1
0
def test_voice_conversion():
    """Invoke the ``tts`` CLI with a speaker wav and a reference wav.

    Two LJSpeech recordings from the tests data directory are passed as
    ``--speaker_wav`` and ``--reference_wav``; ``--out_path`` points at
    ``output.wav`` inside the tests output directory. The command is run
    through ``run_cli``.
    """

    def _ljspeech_wav(file_name):
        # Resolve a wav file inside the LJSpeech test fixture directory.
        return os.path.join(get_tests_data_path(), "ljspeech", "wavs",
                            file_name)

    print(" > Run voice conversion inference using YourTTS model.")
    model_name = "tts_models/multilingual/multi-dataset/your_tts"
    language_id = "en"
    speaker_wav = _ljspeech_wav("LJ001-0001.wav")
    reference_wav = _ljspeech_wav("LJ001-0032.wav")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    # NOTE: the command text (including its spacing) is kept exactly as-is.
    command = (
        f"tts --model_name  {model_name}"
        f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} "
    )
    run_cli(command)
Beispiel #2
0
 def test_d_vector_inference(self):
     """Run Vits inference with explicit d-vectors and check the outputs.

     The model is built with ``use_d_vector_file=True`` and put in eval
     mode; inference is exercised once with a single sequence and once
     with the batch returned by ``self._create_inputs(config)``.
     """
     d_vector_dim = 256
     model_args = VitsArgs(
         spec_segment_size=10,
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=d_vector_dim,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     config = VitsConfig(model_args=model_args)
     model = Vits.init_from_config(config, verbose=False).to(device)
     model.eval()

     # Case 1: batch size 1, no explicit lengths.
     single_input = torch.randint(0, 24, (1, 128)).long().to(device)
     single_aux = {"d_vectors": torch.randn(1, d_vector_dim).to(device)}
     outputs = model.inference(single_input, aux_input=single_aux)
     self._check_inference_outputs(config, outputs, single_input)

     # Case 2: batch size 2 (assumes `_create_inputs` defaults to a batch
     # of two, as the original comment states), with explicit lengths.
     batch_input, batch_lengths, *_ = self._create_inputs(config)
     batch_aux = {
         "x_lengths": batch_lengths,
         "d_vectors": torch.randn(2, d_vector_dim).to(device),
     }
     outputs = model.inference(batch_input, aux_input=batch_aux)
     self._check_inference_outputs(config,
                                   outputs,
                                   batch_input,
                                   batch_size=2)
Beispiel #3
0
 def _test_forward_with_d_vector(self, batch_size):
     """Run a train-mode GlowTTS forward pass with d-vectors and check shapes.

     Args:
         batch_size: forwarded to ``self._create_inputs`` and used as the
             leading dimension of the random d-vector tensor.
     """
     # The fifth value returned by `_create_inputs` is not needed here.
     input_dummy, input_lengths, mel_spec, mel_lengths, _ = self._create_inputs(
         batch_size)
     d_vector = torch.rand(batch_size, 256).to(device)

     # Build a model that takes 256-dim d-vectors from the dummy speakers file.
     speakers_json = os.path.join(get_tests_data_path(),
                                  "dummy_speakers.json")
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=speakers_json,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.train()
     print(" > Num parameters for GlowTTS model:%s" %
           (count_parameters(model)))

     # Forward pass (original note: encoder and decoder with MAS).
     aux_input = {"d_vectors": d_vector}
     y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths,
                       aux_input)

     # Expected shapes, expressed relative to the input tensors.
     spec_shape = mel_spec.shape
     text_shape = input_dummy.shape
     self.assertEqual(y["z"].shape, spec_shape)
     self.assertEqual(y["logdet"].shape, torch.Size([batch_size]))
     self.assertEqual(y["y_mean"].shape, spec_shape)
     self.assertEqual(y["y_log_scale"].shape, spec_shape)
     self.assertEqual(y["alignments"].shape,
                      spec_shape[:2] + (text_shape[1], ))
     self.assertEqual(y["durations_log"].shape, text_shape + (1, ))
     self.assertEqual(y["total_durations_log"].shape, text_shape + (1, ))
Beispiel #4
0
 def test_init_multispeaker(self):
     """Check ``GlowTTS.init_multispeaker`` for four speaker configurations.

     Each case builds a fresh ``GlowTTSConfig``, calls
     ``init_multispeaker`` and asserts on ``c_in_channels`` (and, when a
     speaker manager is attached, on ``num_speakers``).
     """
     # Case 1: learned speaker embedding, no d-vector dimension given.
     # The config is mutated after the model has been constructed.
     cfg = GlowTTSConfig(num_chars=32)
     glow = GlowTTS(cfg)
     cfg.use_speaker_embedding = True
     cfg.num_speakers = 5
     cfg.d_vector_dim = None
     glow.init_multispeaker(cfg)
     self.assertEqual(glow.c_in_channels, glow.hidden_channels_enc)

     # Case 2: external d-vectors with an explicit dimension of 301.
     cfg = GlowTTSConfig(num_chars=32)
     cfg.use_d_vector_file = True
     cfg.d_vector_dim = 301
     glow = GlowTTS(cfg)
     glow.init_multispeaker(cfg)
     self.assertEqual(glow.c_in_channels, 301)

     # Case 3: speaker embeddings, with speakers supplied by a
     # SpeakerManager built from a speakers file.
     cfg = GlowTTSConfig(num_chars=32)
     cfg.use_speaker_embedding = True
     cfg.speakers_file = os.path.join(get_tests_data_path(), "ljspeech",
                                      "speakers.json")
     manager = SpeakerManager.init_from_config(cfg)
     glow = GlowTTS(cfg)
     glow.speaker_manager = manager
     glow.init_multispeaker(cfg)
     self.assertEqual(glow.c_in_channels, glow.hidden_channels_enc)
     self.assertEqual(glow.num_speakers, manager.num_speakers)

     # Case 4: external d-vectors, with the SpeakerManager built from a
     # d-vector file.
     cfg = GlowTTSConfig(num_chars=32)
     cfg.use_d_vector_file = True
     cfg.d_vector_dim = 256
     cfg.d_vector_file = os.path.join(get_tests_data_path(),
                                      "dummy_speakers.json")
     manager = SpeakerManager.init_from_config(cfg)
     glow = GlowTTS(cfg)
     glow.speaker_manager = manager
     glow.init_multispeaker(cfg)
     self.assertEqual(glow.c_in_channels, manager.embedding_dim)
     self.assertEqual(glow.num_speakers, manager.num_speakers)
Beispiel #5
0
    def test_init_from_config(self):
        """Check ``GlowTTS.init_from_config`` for several speaker setups.

        Uses ``assertEqual`` / ``assertFalse`` rather than
        ``assertTrue(a == b)`` / ``assertTrue(not ...)`` so that a failing
        run reports the compared values instead of "False is not true".
        """
        # Plain single-speaker config: only checks that construction works.
        config = GlowTTSConfig(num_chars=32)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)

        # num_speakers given without use_speaker_embedding: no `emb_g`
        # attribute is expected on the model.
        config = GlowTTSConfig(num_chars=32, num_speakers=2)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 2)
        self.assertFalse(hasattr(model, "emb_g"))

        # Speaker embedding enabled: `emb_g` is expected.
        config = GlowTTSConfig(num_chars=32,
                               num_speakers=2,
                               use_speaker_embedding=True)
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 2)
        self.assertTrue(hasattr(model, "emb_g"))

        # Speakers file supplied: the test expects 10 speakers even though
        # num_speakers=2 is passed.
        config = GlowTTSConfig(
            num_chars=32,
            num_speakers=2,
            use_speaker_embedding=True,
            speakers_file=os.path.join(get_tests_data_path(), "ljspeech",
                                       "speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 10)
        self.assertTrue(hasattr(model, "emb_g"))

        # External d-vectors: no `emb_g`, and the conditioning channel
        # count is expected to equal the configured d-vector dimension.
        config = GlowTTSConfig(
            num_chars=32,
            use_d_vector_file=True,
            d_vector_dim=256,
            d_vector_file=os.path.join(get_tests_data_path(),
                                       "dummy_speakers.json"),
        )
        model = GlowTTS.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 1)
        self.assertFalse(hasattr(model, "emb_g"))
        self.assertEqual(model.c_in_channels, config.d_vector_dim)
Beispiel #6
0
    def test_init_from_config(self):
        """Check ``Vits.init_from_config`` for several speaker setups.

        All comparisons use ``assertEqual`` / ``assertFalse`` (the original
        mixed them with ``assertTrue(a == b)`` / ``assertTrue(not ...)``),
        so a failing run reports the compared values.
        """
        # Plain config: only checks that construction works.
        config = VitsConfig(model_args=VitsArgs(num_chars=32))
        model = Vits.init_from_config(config, verbose=False).to(device)

        # num_speakers given without use_speaker_embedding: no `emb_g`
        # attribute is expected on the model.
        config = VitsConfig(model_args=VitsArgs(num_chars=32, num_speakers=2))
        model = Vits.init_from_config(config, verbose=False).to(device)
        self.assertFalse(hasattr(model, "emb_g"))

        # Speaker embedding enabled: `emb_g` is expected.
        config = VitsConfig(model_args=VitsArgs(
            num_chars=32, num_speakers=2, use_speaker_embedding=True))
        model = Vits.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 2)
        self.assertTrue(hasattr(model, "emb_g"))

        # Speakers file supplied: the test expects 10 speakers even though
        # num_speakers=2 is passed.
        config = VitsConfig(model_args=VitsArgs(
            num_chars=32,
            num_speakers=2,
            use_speaker_embedding=True,
            speakers_file=os.path.join(get_tests_data_path(), "ljspeech",
                                       "speakers.json"),
        ))
        model = Vits.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 10)
        self.assertTrue(hasattr(model, "emb_g"))

        # External d-vectors: no `emb_g`, and the embedded speaker
        # dimension is expected to equal the configured d-vector dimension.
        config = VitsConfig(model_args=VitsArgs(
            num_chars=32,
            use_d_vector_file=True,
            d_vector_dim=256,
            d_vector_file=os.path.join(get_tests_data_path(),
                                       "dummy_speakers.json"),
        ))
        model = Vits.init_from_config(config, verbose=False).to(device)
        self.assertEqual(model.num_speakers, 1)
        self.assertFalse(hasattr(model, "emb_g"))
        self.assertEqual(model.embedded_speaker_dim, config.d_vector_dim)
Beispiel #7
0
 def _test_inference_with_d_vector(self, batch_size):
     """Run eval-mode GlowTTS inference with d-vectors and check the outputs.

     Args:
         batch_size: forwarded to ``self._create_inputs`` and used as the
             leading dimension of the random d-vector tensor.
     """
     # Only the text inputs, their lengths and the mel spectrogram are
     # used; the last two values from `_create_inputs` are discarded.
     input_dummy, input_lengths, mel_spec, _, _ = self._create_inputs(
         batch_size)
     d_vector = torch.rand(batch_size, 256).to(device)

     speakers_json = os.path.join(get_tests_data_path(),
                                  "dummy_speakers.json")
     config = GlowTTSConfig(
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=256,
         d_vector_file=speakers_json,
     )
     model = GlowTTS.init_from_config(config, verbose=False).to(device)
     model.eval()

     aux_input = {"x_lengths": input_lengths, "d_vectors": d_vector}
     outputs = model.inference(input_dummy, aux_input)
     self._assert_inference_outputs(outputs, input_dummy, mel_spec)
Beispiel #8
0
 def test_d_vector_forward(self):
     """Run a train-mode Vits forward pass with d-vectors and check the outputs.

     The model is built with ``use_d_vector_file=True``; inputs come from
     ``self._create_inputs`` and the result is validated by
     ``self._check_forward_outputs``.
     """
     batch_size = 2
     d_vector_dim = 256
     model_args = VitsArgs(
         spec_segment_size=10,
         num_chars=32,
         use_d_vector_file=True,
         d_vector_dim=d_vector_dim,
         d_vector_file=os.path.join(get_tests_data_path(),
                                    "dummy_speakers.json"),
     )
     config = VitsConfig(model_args=model_args)
     model = Vits.init_from_config(config, verbose=False).to(device)
     model.train()

     # The third value returned by `_create_inputs` is not used here.
     input_dummy, input_lengths, _, spec, spec_lengths, waveform = self._create_inputs(
         config, batch_size=batch_size)
     aux_input = {"d_vectors": torch.randn(batch_size, d_vector_dim).to(device)}
     output_dict = model.forward(
         input_dummy,
         input_lengths,
         spec,
         spec_lengths,
         waveform,
         aux_input=aux_input,
     )
     self._check_forward_outputs(config, output_dict)
Beispiel #9
0
from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# pylint: disable=unused-variable

# Output directory for the loader tests; created up front if missing.
OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

# Dummy config shared by the data loader tests.
c = BaseTTSConfig(
    text_cleaner="english_cleaners",
    num_loader_workers=0,
    batch_size=2,
    use_noise_augment=False,
)
c.r = 5
c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
ok_ljspeech = os.path.exists(c.data_path)

dataset_config = BaseDatasetConfig(
    name="ljspeech_test",  # original note: "ljspeech_test to multi-speaker"
    meta_file_train="metadata.csv",
    meta_file_val=None,
    path=c.data_path,
    language="en",
)

# True only when the LJSpeech test data directory exists on disk.
DATA_EXIST = os.path.exists(c.data_path)

print(" > Dynamic data loader test: {}".format(DATA_EXIST))