Example #1
 def load_models(self):
     if not torch.cuda.is_available():
             print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                   "for deep learning, ensure that the drivers are properly installed, and that your "
                   "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                   "not supported.", file=sys.stderr)
             quit(-1)
     device_id = torch.cuda.current_device()
     gpu_properties = torch.cuda.get_device_properties(device_id)
     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
           "%.1fGb total memory.\n" % 
           (torch.cuda.device_count(),
            device_id,
            gpu_properties.name,
            gpu_properties.major,
            gpu_properties.minor,
            gpu_properties.total_memory / 1e9))
 
 
     ## Load the models one by one.
     print("Preparing the encoder, the synthesizer and the vocoder...")
     encoder.load_model(self.enc_model_fpath)
     print("Loaded Encoder")
     self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
     print("Loaded Synth")
     vocoder.load_model(self.voc_model_fpath)
     print("Loaded Vocoder")
Example #2
    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("taco_pretrained")
            self.synthesizer = Synthesizer(checkpoints_dir,
                                           low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" %
                        self.synthesizer.checkpoint_fpath)

        texts = self.ui.text_prompt.toPlainText().split("\n")
        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name,
                                  spec, breaks, None)
        self.ui.set_loading(0)
Example #3
    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("checkpoints")
            self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

        ptext = self.ui.text_prompt.toPlainText()
        texts = ptext.split("\n")

        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs, aligns = self.synthesizer.synthesize_spectrograms(texts, embeds, return_alignments=True)

        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)
        align = np.concatenate(aligns, axis=1)

        fref = self.ui.selected_utterance.name
        ftext = '。'.join(texts)
        ftime = '{}'.format(time_formatter())
        fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(fref, ftime, len(ftext), ftext))
        np.save(self._out_mel_dir.joinpath(fname), spec, allow_pickle=False)  # save

        self.ui.draw_spec(spec, "generated")
        self.ui.draw_align(align, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
        self.ui.set_loading(0)
Example #4
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
Example #5
 def __init__(self, encoder_model_path, synthesizer_model_path,
              vocoder_model_path):
     print("Preparing the encoder, the synthesizer and the vocoder...")
     self.encoder = encoder
     # self.encoder.load_model(Path(encoder_model_path))
     self.synthesizer = Synthesizer(
         Path(synthesizer_model_path).joinpath("taco_pretrained"),
         low_mem=False)
     self.vocoder = vocoder
     self.vocoder.load_model(Path(vocoder_model_path))
Example #6
    def __init__(self):
        # Info & args
        enc_model_fpath = Path("encoder/saved_models/pretrained.pt")

        syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
        voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
        low_mem = False

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
        vocoder.load_model(voc_model_fpath)
Example #7
def DeepTalk_synthesizer(encoder_embedding,
                         output_text,
                         model_save_path,
                         low_mem=False):
    synthesizer = Synthesizer(model_save_path, low_mem=low_mem)
    texts = output_text
    texts = texts.split("\n")
    embeds = np.stack([encoder_embedding] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    mel = spec

    return mel, breaks
Example #8
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name
            
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return 
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        if fpath.suffix.lower() == ".mp3" and self.no_mp3_support:
            self.ui.log("Error: No mp3 file argument was passed but an mp3 file was used")
            return

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)
Example #9
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root, self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)

            if (str(self.datasets_root)[0] == '/'
                    or str(self.datasets_root)[1] == ':'):
                name = str(fpath.relative_to(self.datasets_root))
            else:
                name = os.getcwd() + '/' + str(fpath)
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name

            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = str(fpath).replace('\\', '/')
            speaker_name = 'Custom'

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(name)
        self.ui.log("Loaded %s" % name)
        self.filename = os.path.basename(name)
        self.add_real_utterance(wav, name, speaker_name)
Example #10
def DeepTalk_encoder(file_path,
                     model_save_path,
                     module_name,
                     preprocess=True,
                     normalize=True,
                     sampling_rate=8000,
                     duration=None):

    encoder.load_model(model_save_path, module_name=module_name)

    if (preprocess):
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True)

    if (normalize):
        embed = embed / np.linalg.norm(embed)

    return embed
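
This encoder helper pairs naturally with the DeepTalk_synthesizer of Example #7. Below is a minimal, hedged sketch of chaining the two; the reference wav path, model paths, and module name are hypothetical placeholders, not values taken from these examples.

# Hedged sketch chaining DeepTalk_encoder (Example #10) with DeepTalk_synthesizer (Example #7).
# All paths and the module name below are hypothetical placeholders.
embed = DeepTalk_encoder("ref.wav",
                         model_save_path="saved_models/encoder.pt",
                         module_name="default")
mel, breaks = DeepTalk_synthesizer(embed,
                                   "First sentence.\nSecond sentence.",
                                   model_save_path="saved_models/synthesizer")
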
Example #11
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        self.ui.save_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        if not speaker_name is None:
            name = speaker_name
        else:
            name = "unknown"
        name = name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
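
The "Add breaks" bookkeeping in the method above converts per-line spectrogram lengths (in frames) into waveform sample ranges before silence is inserted between lines. A small standalone sketch with assumed numbers; the hop size is a placeholder, not a value from these examples.

# Illustrative sketch of the break bookkeeping above, with assumed values.
import numpy as np

breaks = [50, 80]                                  # frames per synthesized line (assumed)
hop_size = 200                                     # placeholder hop size (samples per frame)
b_ends = np.cumsum(np.array(breaks) * hop_size)    # -> array([10000, 26000])
b_starts = np.concatenate(([0], b_ends[:-1]))      # -> array([    0, 10000])
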
Example #12
def load_model(in_fpath, parser):

	parser.add_argument("-e", "--enc_model_fpath", type=Path, 
		        default="encoder/saved_models/pretrained.pt",
		        help="Path to a saved encoder")
	parser.add_argument("-s", "--syn_model_dir", type=Path, 
		        default="synthesizer/saved_models/logs-pretrained/",
		        help="Directory containing the synthesizer model")
	parser.add_argument("-v", "--voc_model_fpath", type=Path, 
		        default="vocoder/saved_models/pretrained/pretrained.pt",
		        help="Path to a saved vocoder")
	parser.add_argument("--low_mem", action="store_true", help=\
	"If True, the memory used by the synthesizer will be freed after each use. Adds large "
	"overhead but allows to save some GPU memory for lower-end GPUs.")
	parser.add_argument("--no_sound", action="store_true", help=\
	"If True, audio won't be played.")
	args = parser.parse_args()
	encoder.load_model(args.enc_model_fpath)
	synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
	vocoder.load_model(args.voc_model_fpath)

	preprocessed_wav = encoder.preprocess_wav(in_fpath)
	original_wav, sampling_rate = librosa.load(in_fpath)
	preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
	embed = encoder.embed_utterance(preprocessed_wav)
	
	return synthesizer, sampling_rate, embed
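
A minimal, hedged usage sketch for this loader, assuming it runs from a script with no extra command-line arguments and that the encoder/synthesizer/vocoder modules are imported as in the examples above; "ref.wav" is a hypothetical reference clip.

# Hedged sketch for Example #12's load_model; "ref.wav" is a hypothetical reference clip.
import argparse

synthesizer, sampling_rate, embed = load_model("ref.wav", argparse.ArgumentParser())
specs = synthesizer.synthesize_spectrograms(["Hello, world."], [embed])
wav = vocoder.infer_waveform(specs[0])
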
Example #13
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #14
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            # name = '/'.join(fpath.relative_to(self.datasets_root).parts)
            dat = self.ui.current_dataset_name.replace("\\", "#").replace("/", "#")
            spk = self.ui.current_speaker_name.replace("\\", "#").replace("/", "#")
            aud = self.ui.current_utterance_name.replace("\\", "#").replace("/", "#")
            speaker_name = "#".join((dat, spk))
            name = "#".join((speaker_name, aud))
            # name = '-'.join(fpath.relative_to(self.datasets_root.joinpath(self.ui.current_dataset_name)).parts)
            # speaker_name = self.ui.current_speaker_name.replace("\\", "-").replace("/", "-")
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)
Example #15
    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'),
                embed,
                allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)
Example #16
    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("checkpoints")
            self.synthesizer = Synthesizer(checkpoints_dir,
                                           low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" %
                        self.synthesizer.checkpoint_fpath)

        ptext = self.ui.text_prompt.toPlainText()
        texts = ptext.split("\n")

        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)

        # Trim silence or noise from the start and end
        for num, spec in enumerate(specs):
            tmp = spec.T
            sidx, eidx = find_start_end_points(tmp)
            specs[num] = tmp[sidx:eidx].T

        # specs = [spec.T[:find_endpoint(spec.T)].T for spec in specs]  # find endpoint
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        fref = '-'.join([
            self.ui.current_dataset_name, self.ui.current_speaker_name,
            self.ui.current_utterance_name
        ])
        ftext = '。'.join(texts)
        ftime = '{}'.format(time_formatter())
        fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(
            fref, ftime, len(ftext), ftext))
        np.save(self._out_mel_dir.joinpath(fname), spec,
                allow_pickle=False)  # save

        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name,
                                  spec, breaks, None)
        self.ui.set_loading(0)
Example #17
def setup():
    global synthesizer
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
Example #18
    def init_synthesizer(self):
        model_fpath = self.ui.current_synthesizer_fpath

        self.ui.log("Loading the synthesizer %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        self.synthesizer = Synthesizer(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
Example #19
def load_models():
    #encoder_weights = Path(encoder_path)
    vocoder_weights = Path(vocoder_path)
    syn_dir = Path(synthesizer_path)
    #encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)

    return encoder, synthesizer, vocoder
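
A minimal, hedged sketch of using the objects returned above. Note that the encoder's load call is commented out in this example, so `embed` is assumed to be a precomputed speaker embedding (see Examples #10 and #12); `synthesizer_path` and `vocoder_path` are assumed to be defined elsewhere.

# Hedged sketch using the objects returned by Example #19's load_models().
# `embed` is assumed to be a precomputed speaker embedding.
encoder, synthesizer, vocoder = load_models()
specs = synthesizer.synthesize_spectrograms(["Hello, world."], [embed])
generated_wav = vocoder.infer_waveform(specs[0])
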
Example #20
    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return

        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_{}".format(time_formatter())
        fpath = self._out_record_dir.joinpath(name + '.wav')
        audio.save_wav(wav, fpath, encoder.sampling_rate)  # save
        wav = Synthesizer.load_preprocess_wav(fpath)  # keep the data format consistent

        self.add_real_utterance(wav, name, speaker_name)
Example #21
    def preprocess(self):
        wav = self.ui.selected_utterance.wav
        out = aukit.remove_noise(wav, sr=Synthesizer.sample_rate)
        hp = aukit.Dict2Obj({})
        hp["vad_window_length"] = 10  # milliseconds
        hp["vad_moving_average_width"] = 2
        hp["vad_max_silence_length"] = 2
        hp["audio_norm_target_dBFS"] = -32
        hp["sample_rate"] = 16000
        hp["int16_max"] = (2**15) - 1
        out = trim_long_silences(out, hparams=hp)

        spec = Synthesizer.make_spectrogram(out)
        self.ui.draw_align(spec[::-1], "current")
Example #22
    def clone_voice(self, embed):
        synthesizer = Synthesizer("synthesizer/saved_models/logs-pretrained/taco_pretrained")
        vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")
        with open(self.json_text) as text_json:
            data = json.load(text_json)
            for x in data:
                text = x['translation']
                # The synthesizer works in batch, so you need to put your data in a list or numpy array
                texts = [text]
                embeds = [embed]
                # If you know what the attention layer alignments are, you can retrieve them here by
                # passing return_alignments=True
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = specs[0]

                ## Generating the waveform
                print("\nSynthesizing the waveform:")
                # Synthesizing the waveform is fairly straightforward. Remember that the longer the
                # spectrogram, the more time-efficient the vocoder.
                generated_wav = vocoder.infer_waveform(spec)

                ## Post-generation
                # There's a bug with sounddevice that makes the audio cut one second earlier, so we
                # pad it.
                generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

                # Save it on the disk
                output_dir = '../temp'
                try:
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                except:
                    pass
                fpath = "%s/%d.wav" % (output_dir, x['index'])
                generated_wav *= 32767 / max(0.01, np.max(np.abs(generated_wav)))
                wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.int16))
Example #23
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name
            
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        else:
            name = fpath.name
            speaker_name = fpath.parent.name
        
        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)
Example #24
class Toolbox:
    def __init__(self, datasets_root, enc_models_dir, syn_models_dir,
                 voc_models_dir, low_mem):
        sys.excepthook = self.excepthook
        self.datasets_root = datasets_root
        self.low_mem = low_mem
        self.utterances = set()
        self.current_generated = (None, None, None, None)  # speaker_name, spec, breaks, wav

        self.synthesizer = None  # type: Synthesizer

        # Initialize the events and the interface
        self.ui = UI()
        self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
        self.setup_events()
        self.ui.start()

    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)

    def setup_events(self):
        # Dataset, speaker and utterance selection
        self.ui.browser_load_button.clicked.connect(
            lambda: self.load_from_browser())
        random_func = lambda level: lambda: self.ui.populate_browser(
            self.datasets_root, recognized_datasets, level)
        self.ui.random_dataset_button.clicked.connect(random_func(0))
        self.ui.random_speaker_button.clicked.connect(random_func(1))
        self.ui.random_utterance_button.clicked.connect(random_func(2))
        self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
        self.ui.speaker_box.currentIndexChanged.connect(random_func(2))

        # Model selection
        self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)

        def func():
            self.synthesizer = None

        self.ui.synthesizer_box.currentIndexChanged.connect(func)
        self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)

        # Utterance selection
        func = lambda: self.load_from_browser(self.ui.browse_file())
        self.ui.browser_browse_button.clicked.connect(func)
        func = lambda: self.ui.draw_utterance(self.ui.selected_utterance,
                                              "current")
        self.ui.utterance_history.currentIndexChanged.connect(func)
        func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
        self.ui.play_button.clicked.connect(func)
        self.ui.stop_button.clicked.connect(self.ui.stop)
        self.ui.record_button.clicked.connect(self.record)

        # Generation
        func = lambda: self.synthesize() or self.vocode()
        self.ui.generate_button.clicked.connect(func)
        self.ui.synthesize_button.clicked.connect(self.synthesize)
        self.ui.vocode_button.clicked.connect(self.vocode)

        # UMAP legend
        self.ui.clear_button.clicked.connect(self.clear_utterances)

    def reset_ui(self, encoder_models_dir, synthesizer_models_dir,
                 vocoder_models_dir):
        self.ui.populate_browser(self.datasets_root, recognized_datasets, 0,
                                 True)
        self.ui.populate_models(encoder_models_dir, synthesizer_models_dir,
                                vocoder_models_dir)

    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root, self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name

            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)

    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return
        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_%05d" % np.random.randint(100000)
        self.add_real_utterance(wav, name, speaker_name)

    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)

    def clear_utterances(self):
        self.utterances.clear()
        self.ui.draw_umap_projections(self.utterances)

    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("taco_pretrained")
            self.synthesizer = Synthesizer(checkpoints_dir,
                                           low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" %
                        self.synthesizer.checkpoint_fpath)

        texts = self.ui.text_prompt.toPlainText().split("\n")
        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name,
                                  spec, breaks, None)
        self.ui.set_loading(0)

    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)

    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath

        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)

    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Case of Griffin-lim
        if model_fpath is None:
            return

        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        vocoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
Example #25
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=args.low_mem)
    #vocoder.load_model(args.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
Example #26
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}
        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check that the audio file is no larger than 10 MB
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav,
                               (0, synthesizer.sample_rate),
                               mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        return {"audio": fp.read()}

    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
Example #27
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    ensure_default_models(Path("saved_models"))
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_fpath)
    vocoder.load_model(args.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
Example #28
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath, device='cpu')
    synthesizer = Synthesizer(args.syn_model_dir, low_mem=args.low_mem)
    # vocoder.load_model(args.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
    embed = np.random.rand(speaker_embedding_size)
    embed /= np.linalg.norm(embed)
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["你好", "欢迎使用语音克隆工具"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    mel = np.concatenate(mels, axis=1)
    no_action = lambda *args: None
Example #29
 def test_config(self):
     ## Print some environment information (for debugging purposes)
     print("Running a test of your configuration...\n")
     try:
         if not torch.cuda.is_available():
             print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                   "for deep learning, ensure that the drivers are properly installed, and that your "
                   "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                   "not supported.", file=sys.stderr)
             quit(-1)
         device_id = torch.cuda.current_device()
         gpu_properties = torch.cuda.get_device_properties(device_id)
         print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
               "%.1fGb total memory.\n" % 
               (torch.cuda.device_count(),
                device_id,
                gpu_properties.name,
                gpu_properties.major,
                gpu_properties.minor,
                gpu_properties.total_memory / 1e9))
     
     
         ## Load the models one by one.
         print("Preparing the encoder, the synthesizer and the vocoder...")
         encoder.load_model(self.enc_model_fpath)
         print("Loaded Encoder")
         self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
         print("Loaded Synth")
         vocoder.load_model(self.voc_model_fpath)
         print("Loaded Vocoder")
         
         ## Run a test
         print("Testing your configuration with small inputs.")
         # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
         # sampling rate, which may differ.
         # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 
         # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
         # The sampling rate is the number of values (samples) recorded per second, it is set to
         # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond 
         # to an audio of 1 second.
         print("\tTesting the encoder...")
         encoder.embed_utterance(np.zeros(encoder.sampling_rate))
         
         # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
         # returns, but here we're going to make one ourselves just for the sake of showing that it's
         # possible.
         embed = np.random.rand(speaker_embedding_size)
         # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 
         # embeddings it will be).
         embed /= np.linalg.norm(embed)
         # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 
         # illustrate that
         embeds = [embed, np.zeros(speaker_embedding_size)]
         texts = ["test 1", "test 2"]
         print("\tTesting the synthesizer... (loading the model will output a lot of text)")
         mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
         
         # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
         # can concatenate the mel spectrograms to a single one.
         mel = np.concatenate(mels, axis=1)
         # The vocoder can take a callback function to display the generation. More on that later. For 
         # now we'll simply hide it like this:
         no_action = lambda *args: None
         print("\tTesting the vocoder...")
         # For the sake of making this test short, we'll pass a short target length. The target length 
         # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
         # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
         # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
         # that has a detrimental effect on the quality of the audio. The default parameters are 
         # recommended in general.
         vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
         
         print("\tAll test passed!")
         
         return("All test passed!")
         
     except Exception as e:
         return("Caught exception: %s" % repr(e))
Example #30
class VoiceClone:
    
    def __init__(self,
                 audio=params.DATASETS_ROOT,
                 text=params.INPUT_TEXT,
                 output_dir=params.OUTPUT_DIR):
        
        sys.excepthook = self.excepthook
        self.datasets_root = audio
        self.enc_model_fpath = Path(params.ENC_MODEL_FPATH)
        self.syn_model_dir = Path(params.SYN_MODEL_DIR)
        self.voc_model_fpath = Path(params.VOC_MODEL_FPATH)
        self.low_mem = params.LOW_MEM
        self.synthesizer = None # type: Synthesizer
        
        # Added to point directory of input and output directories
        self.input_text = text
        self.output_dir = output_dir
                
    
    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)
    
    def load_models(self):
        if not torch.cuda.is_available():
                print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                      "for deep learning, ensure that the drivers are properly installed, and that your "
                      "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                      "not supported.", file=sys.stderr)
                quit(-1)
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" % 
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    
    
        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)
        print("Loaded Encoder")
        self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
        print("Loaded Synth")
        vocoder.load_model(self.voc_model_fpath)
        print("Loaded Vocoder")
        
    
    def test_config(self):
        ## Print some environment information (for debugging purposes)
        print("Running a test of your configuration...\n")
        try:
            if not torch.cuda.is_available():
                print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                      "for deep learning, ensure that the drivers are properly installed, and that your "
                      "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                      "not supported.", file=sys.stderr)
                quit(-1)
            device_id = torch.cuda.current_device()
            gpu_properties = torch.cuda.get_device_properties(device_id)
            print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
                  "%.1fGb total memory.\n" % 
                  (torch.cuda.device_count(),
                   device_id,
                   gpu_properties.name,
                   gpu_properties.major,
                   gpu_properties.minor,
                   gpu_properties.total_memory / 1e9))
        
        
            ## Load the models one by one.
            print("Preparing the encoder, the synthesizer and the vocoder...")
            encoder.load_model(self.enc_model_fpath)
            print("Loaded Encoder")
            self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
            print("Loaded Synth")
            vocoder.load_model(self.voc_model_fpath)
            print("Loaded Vocoder")
            
            ## Run a test
            print("Testing your configuration with small inputs.")
            # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
            # sampling rate, which may differ.
            # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 
            # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
            # The sampling rate is the number of values (samples) recorded per second, it is set to
            # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond 
            # to an audio of 1 second.
            print("\tTesting the encoder...")
            encoder.embed_utterance(np.zeros(encoder.sampling_rate))
            
            # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
            # returns, but here we're going to make one ourselves just for the sake of showing that it's
            # possible.
            embed = np.random.rand(speaker_embedding_size)
            # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 
            # embeddings it will be).
            embed /= np.linalg.norm(embed)
            # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 
            # illustrate that
            embeds = [embed, np.zeros(speaker_embedding_size)]
            texts = ["test 1", "test 2"]
            print("\tTesting the synthesizer... (loading the model will output a lot of text)")
            mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
            
            # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
            # can concatenate the mel spectrograms to a single one.
            mel = np.concatenate(mels, axis=1)
            # The vocoder can take a callback function to display the generation. More on that later. For 
            # now we'll simply hide it like this:
            no_action = lambda *args: None
            print("\tTesting the vocoder...")
            # For the sake of making this test short, we'll pass a short target length. The target length 
            # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
            # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
            # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
            # that has a detrimental effect on the quality of the audio. The default parameters are 
            # recommended in general.
            vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
            
            print("\tAll test passed!")
            
            return("All test passed!")
            
        except Exception as e:
            return("Caught exception: %s" % repr(e))
        
    def compute_embedding(self, spk_file):
        in_fpath = spk_file
                    
        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is 
        # important: there is preprocessing that must be applied.
        
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file succesfully")
        
        # Then we derive the embedding. There are many functions and parameters that the 
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding\n")
        
        return embed
    
    def parse_text(self):
        lineList = [line.rstrip('\n') for line in open(self.input_text)]
        return lineList
    
    def gen_spect(self, embed, text):
        # The synthesizer works in batch, so you need to put your data in a list or numpy array
        embeds = np.stack([embed] * len(text))
        specs = self.synthesizer.synthesize_spectrograms(text, embeds)
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)
        
        print("Created the mel spectrogram\n")
        
        return spec, breaks
    
    def vocode(self, spec, breaks):
        ## Generating the waveform
        print("Synthesizing the waveform:")
        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        wav = vocoder.infer_waveform(spec)
        
        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
        
        ## Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        wav = np.pad(wav, (0, self.synthesizer.sample_rate), mode="constant")
        return wav
    
    def save_to_disk(self, generated_wav, spk):
        # Save it on the disk
        fpath = "output_%s.wav" % spk
        out_path = os.path.join(self.output_dir,fpath)
        librosa.output.write_wav(out_path, generated_wav.astype(np.float32), 
                                 self.synthesizer.sample_rate)
        
        print("\nSaved output as %s\n\n" % fpath)

    def synt_speech(self):        
        print("Starting web service")
        #num_generated = 0
        try:
            # Load encoder, synthesizer and vocoder models
            print("Loading models...\n")
            self.load_models()
            
            # Load script into a list
            text = self.parse_text()
        
            # Get the reference audio filepath
            spk_folders = os.listdir(self.datasets_root)
            
            for spk in spk_folders:
                print("Processing Speaker: {}".format(spk))
                spk_dir = os.path.join(self.datasets_root,spk)
                input_dir = os.path.join(spk_dir,"*.wav")
                spk_files_list = glob.glob(input_dir)
                print("Total number of audio files in directory: {}\n".format(len(spk_files_list)))
                print(spk_files_list)

                for spk_file in spk_files_list:
                    embed = self.compute_embedding(spk_file)
                    spec, breaks = self.gen_spect(embed, text)
                    generated_wav = self.vocode(spec, breaks)
                    self.save_to_disk(generated_wav, spk)

            return ("Done. Processed: {} speakers".format(len(spk_folders)))

        except Exception as e:
            print("Caught exception: %s" % repr(e))