Example #1
def DeepTalk_synthesizer(encoder_embedding,
                         output_text,
                         model_save_path,
                         low_mem=False):
    synthesizer = Synthesizer(model_save_path, low_mem=low_mem)
    texts = output_text.split("\n")
    embeds = np.stack([encoder_embedding] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)
    mel = spec

    return mel, breaks
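
A minimal usage sketch for `DeepTalk_synthesizer` above. The random embedding, its 256-value length, and the model path are placeholders for illustration; a real call would pass an embedding produced by the speaker encoder.

import numpy as np

# Placeholder speaker embedding (random, L2-normalized); the true dimensionality
# depends on the encoder that produced it.
embedding = np.random.rand(256).astype(np.float32)
embedding /= np.linalg.norm(embedding)

mel, breaks = DeepTalk_synthesizer(
    encoder_embedding=embedding,
    output_text="Hello world.\nThis is the second line.",
    model_save_path="synthesizer/saved_models/pretrained",  # placeholder directory
    low_mem=False)
print(mel.shape, breaks)
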
Example #2
    def clone_voice(self, embed):
        synthesizer = Synthesizer("synthesizer/saved_models/logs-pretrained/taco_pretrained")
        vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")
        with open(self.json_text) as text_json:
            data = json.load(text_json)
            for x in data:
                text = x['translation']
                # The synthesizer works in batch, so you need to put your data in a list or numpy array
                texts = [text]
                embeds = [embed]
                # If you know what the attention layer alignments are, you can retrieve them here by
                # passing return_alignments=True
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = specs[0]

                ## Generating the waveform
                print("\nSynthesizing the waveform:")
                # Synthesizing the waveform is fairly straightforward. Remember that the longer the
                # spectrogram, the more time-efficient the vocoder.
                generated_wav = vocoder.infer_waveform(spec)

                ## Post-generation
                # There's a bug with sounddevice that makes the audio cut one second earlier, so we
                # pad it.
                generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

                # Save it on the disk
                output_dir = '../temp'
                os.makedirs(output_dir, exist_ok=True)
                fpath = "%s/%d.wav" % (output_dir, x['index'])
                generated_wav *= 32767 / max(0.01, np.max(np.abs(generated_wav)))
                wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.int16))
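
The loop above expects the JSON file behind `self.json_text` to be a list of objects carrying `index` and `translation` fields. A small sketch of producing such a file; the path and sentences are placeholders.

import json

entries = [
    {"index": 0, "translation": "First sentence to synthesize."},
    {"index": 1, "translation": "Second sentence to synthesize."},
]
with open("translations.json", "w") as f:  # placeholder for self.json_text
    json.dump(entries, f, indent=2)
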
def run(args: argparse.Namespace):
    # Load encoder model
    encoder = EncoderModel()
    encoder.load(Path('encoder/saved_models/pretrained.pt'))
    # Synthesize the spectrogram
    synthesizer = Synthesizer(
        Path('synthesizer/saved_models/logs-pretrained/taco_pretrained'))
    # Load vocoder
    vocoder = VocoderModel()
    vocoder.load_from(Path('vocoder/saved_models/pretrained/pretrained.pt'),
                      verbose=False)

    # [p304, p305, ...]
    speaker_dirs = [f.parts[-1] for f in _WAV_FODLER.glob("*") if f.is_dir()]
    if len(speaker_dirs) == 0:
        raise Exception(
            "No speakers found. Make sure the path points to a directory of speaker folders.")

    # 'p304' -> [001.wav, 002.wav, ...]
    speaker_utterances = dict()  # type: typing.Dict[str, typing.List[str]]
    for d in speaker_dirs:
        speaker_utterances[d] = [
            w.parts[-1] for w in _WAV_FODLER.joinpath(d).glob('*.wav')
        ]

    speaker_embeddings = dict()  # type: typing.Dict[str, np.ndarray]
    for d in speaker_utterances:
        utterances = speaker_utterances[d]
        enrollments = utterances[:10]
        logging.error(f'speaker: {d}, enrollments: {enrollments}')
        speaker_embeddings[d] = encoder.embed_speaker(
            [_WAV_FODLER.joinpath(d, u) for u in enrollments])

    # Same speaker attack
    for d in speaker_utterances:
        utterances = speaker_utterances[d]
        # Repeat 5 times
        for utterance in np.random.choice(utterances, size=5,
                                          replace=False):  # type: str
            # generated audio
            txt = _TXT_FODLER.joinpath(d, utterance).with_suffix('.txt')
            text = txt.read_text()

            # original audio
            utterance_embedding = encoder.embed_utterance(
                _WAV_FODLER.joinpath(d, utterance),
                source_sr=Synthesizer.sample_rate)
            cosine_similarity = 1.0 - scipy.spatial.distance.cosine(
                speaker_embeddings[d], utterance_embedding)
            logging.error(
                f'ori: speaker: {d}, utterance: {utterance}, text: {text}, sim: {cosine_similarity}'
            )

            specs = synthesizer.synthesize_spectrograms(
                [text], [speaker_embeddings[d]])
            spec = np.concatenate(specs, axis=1)
            wav = vocoder.infer_waveform(spec)

            utterance_embedding = encoder.embed_utterance(
                wav, source_sr=Synthesizer.sample_rate)
            cosine_similarity = 1.0 - scipy.spatial.distance.cosine(
                speaker_embeddings[d], utterance_embedding)
            logging.error(
                f'gen: speaker: {d}, utterance: {utterance}, text: {text}, sim: {cosine_similarity}'
            )

            # Save wav
            filename = f'/tmp/gen-{d}-{utterance}'
            soundfile.write(filename, wav, Synthesizer.sample_rate, 'PCM_16')
            logging.error(f"Saved audio to {filename}")
Example #4
class Generator:
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    synthesizer = None

    def __init__(self):
        self.synthesizer = Synthesizer(
            self.syn_model_dir.joinpath("taco_pretrained"), low_mem=False)

    def initialize(self):
        print("Running a test of your configuration...\n")
        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.")
            quit(-1)
        print("PyTorch is available and working...")
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
        ## Load the models one by one.

        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)

        vocoder.load_model(self.voc_model_fpath)

        ## Run a test
        print("Testing your configuration with small inputs.")
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        embed = np.random.rand(speaker_embedding_size)
        embed /= np.linalg.norm(embed)
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        mel = np.concatenate(mels, axis=1)
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)
        print("All test passed! You can now synthesize speech.\n\n")

    def generate_voice(self, in_fpath, text, out_fpath):
        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file successfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            ## Generating the waveform
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)

            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")
            librosa.output.write_wav(out_fpath,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % out_fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #5
    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    #vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
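
For contrast with the deliberately tiny test values, a sketch of the same call with the half-second chunking the comment describes, reusing `mel`, `no_action` and `vocoder` from above; the numbers are illustrative, not guaranteed library defaults.

# Roughly 0.5 s chunks at 16 kHz, with an overlap region for cross-fading.
wav = vocoder.infer_waveform(mel,
                             target=8000,
                             overlap=800,
                             progress_callback=no_action)
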
Example #6
class VoiceClone:
    
    def __init__(self,
                 audio=params.DATASETS_ROOT,
                 text=params.INPUT_TEXT,
                 output_dir=params.OUTPUT_DIR):
        
        sys.excepthook = self.excepthook
        self.datasets_root = audio
        self.enc_model_fpath = Path(params.ENC_MODEL_FPATH)
        self.syn_model_dir = Path(params.SYN_MODEL_DIR)
        self.voc_model_fpath = Path(params.VOC_MODEL_FPATH)
        self.low_mem = params.LOW_MEM
        self.synthesizer = None # type: Synthesizer
        
        # Added to point directory of input and output directories
        self.input_text = text
        self.output_dir = output_dir
                
    
    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)
    
    def load_models(self):
        if not torch.cuda.is_available():
                print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                      "for deep learning, ensure that the drivers are properly installed, and that your "
                      "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                      "not supported.", file=sys.stderr)
                quit(-1)
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" % 
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    
    
        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)
        print("Loaded Encoder")
        self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
        print("Loaded Synth")
        vocoder.load_model(self.voc_model_fpath)
        print("Loaded Vocoder")
        
    
    def test_config(self):
        ## Print some environment information (for debugging purposes)
        print("Running a test of your configuration...\n")
        try:
            if not torch.cuda.is_available():
                print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                      "for deep learning, ensure that the drivers are properly installed, and that your "
                      "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                      "not supported.", file=sys.stderr)
                quit(-1)
            device_id = torch.cuda.current_device()
            gpu_properties = torch.cuda.get_device_properties(device_id)
            print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
                  "%.1fGb total memory.\n" % 
                  (torch.cuda.device_count(),
                   device_id,
                   gpu_properties.name,
                   gpu_properties.major,
                   gpu_properties.minor,
                   gpu_properties.total_memory / 1e9))
        
        
            ## Load the models one by one.
            print("Preparing the encoder, the synthesizer and the vocoder...")
            encoder.load_model(self.enc_model_fpath)
            print("Loaded Encoder")
            self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
            print("Loaded Synth")
            vocoder.load_model(self.voc_model_fpath)
            print("Loaded Vocoder")
            
            ## Run a test
            print("Testing your configuration with small inputs.")
            # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
            # sampling rate, which may differ.
            # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
            # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
            # The sampling rate is the number of values (samples) recorded per second; it is set to
            # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
            # to 1 second of audio.
            print("\tTesting the encoder...")
            encoder.embed_utterance(np.zeros(encoder.sampling_rate))
            
            # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
            # returns, but here we're going to make one ourselves just for the sake of showing that it's
            # possible.
            embed = np.random.rand(speaker_embedding_size)
            # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 
            # embeddings it will be).
            embed /= np.linalg.norm(embed)
            # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 
            # illustrate that
            embeds = [embed, np.zeros(speaker_embedding_size)]
            texts = ["test 1", "test 2"]
            print("\tTesting the synthesizer... (loading the model will output a lot of text)")
            mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
            
            # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
            # can concatenate the mel spectrograms to a single one.
            mel = np.concatenate(mels, axis=1)
            # The vocoder can take a callback function to display the generation. More on that later. For 
            # now we'll simply hide it like this:
            no_action = lambda *args: None
            print("\tTesting the vocoder...")
            # For the sake of making this test short, we'll pass a short target length. The target length 
            # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
            # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
            # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
            # that has a detrimental effect on the quality of the audio. The default parameters are 
            # recommended in general.
            vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
            
            print("\tAll test passed!")
            
            return("All test passed!")
            
        except Exception as e:
            return("Caught exception: %s" % repr(e))
        
    def compute_embedding(self, spk_file):
        in_fpath = spk_file
                    
        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is 
        # important: there is preprocessing that must be applied.
        
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file succesfully")
        
        # Then we derive the embedding. There are many functions and parameters that the 
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding\n")
        
        return embed
    
    def parse_text(self):
        with open(self.input_text) as f:
            return [line.rstrip('\n') for line in f]
    
    def gen_spect(self, embed, text):
        # The synthesizer works in batch, so you need to put your data in a list or numpy array
        embeds = np.stack([embed] * len(text))
        specs = self.synthesizer.synthesize_spectrograms(text, embeds)
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)
        
        print("Created the mel spectrogram\n")
        
        return spec, breaks
    
    def vocode(self, spec, breaks):
        ## Generating the waveform
        print("Synthesizing the waveform:")
        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        wav = vocoder.infer_waveform(spec)
        
        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
        
        ## Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        wav = np.pad(wav, (0, self.synthesizer.sample_rate), mode="constant")
        return wav
    
    def save_to_disk(self, generated_wav, spk):
        # Save it on the disk
        fpath = "output_%s.wav" % spk
        out_path = os.path.join(self.output_dir,fpath)
        librosa.output.write_wav(out_path, generated_wav.astype(np.float32), 
                                 self.synthesizer.sample_rate)
        
        print("\nSaved output as %s\n\n" % fpath)

    def synt_speech(self):        
        print("Starting web service")
        #num_generated = 0
        try:
            # Load encoder, synthesizer and vocoder models
            print("Loading models...\n")
            self.load_models()
            
            # Load script into a list
            text = self.parse_text()
        
            # Get the reference audio filepath
            spk_folders = os.listdir(self.datasets_root)
            
            for spk in spk_folders:
                print("Processing Speaker: {}".format(spk))
                spk_dir = os.path.join(self.datasets_root,spk)
                input_dir = os.path.join(spk_dir,"*.wav")
                spk_files_list = glob.glob(input_dir)
                print("Total number of audio files in directory: {}\n".format(len(spk_files_list)))
                print(spk_files_list)

                for spk_file in spk_files_list:
                    embed = self.compute_embedding(spk_file)
                    spec, breaks = self.gen_spect(embed, text)
                    generated_wav = self.vocode(spec, breaks)
                    self.save_to_disk(generated_wav, spk)

            return ("Done. Processed: {} speakers".format(len(spk_folders)))

        except Exception as e:
            print("Caught exception: %s" % repr(e))
Example #7
            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # text = input("Write a sentence (+-20 words) to be synthesized:\n")
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            print(chosen_text)
            text_red = [chosen_text]
            specs = synthesizer.synthesize_spectrograms(text_red, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")
            print("synthesizer.sample_rad: {}".format(synthesizer.sample_rate))
    texts = list(re.split("[!.] ", text))

    for text in texts:
        try:

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):

            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            print([text])

            specs = synthesizer.synthesize_spectrograms([text], embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")

            # If seed is specified, reset torch seed and reload vocoder
            if args.seed is not None:
                torch.manual_seed(args.seed)
                vocoder.load_model(args.voc_model_fpath)

            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)
def hello_world():
    legoutput = upload_file()
    lig = "This is a demo utterance. This will work when you do not add any utterance."
    if request.method == 'POST':
        lig = request.form["textarea"]
    print(str(lig))
    #return mainpage()
    if str(legoutput) == "None":
        return render_template("index.html", output="")
    else:
        from encoder.params_model import model_embedding_size as speaker_embedding_size
        from utils.argutils import print_args
        from synthesizer.inference import Synthesizer
        from encoder import inference as encoder
        from vocoder import inference as vocoder
        from pathlib import Path
        import numpy as np
        import librosa
        import argparse
        import torch
        try:
            parser = argparse.ArgumentParser(
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
            parser.add_argument("-e",
                                "--enc_model_fpath",
                                type=Path,
                                default="encoder/saved_models/pretrained.pt")
            parser.add_argument(
                "-s",
                "--syn_model_dir",
                type=Path,
                default="synthesizer/saved_models/logs-pretrained/")
            parser.add_argument(
                "-v",
                "--voc_model_fpath",
                type=Path,
                default="vocoder/saved_models/pretrained/pretrained.pt")
            parser.add_argument("--low_mem", action="store_true")
            #parser.add_argument("--no_sound", action="store_true")
            args = parser.parse_args()
            print_args(args, parser)
            #if not args.no_sound:
            #    import sounddevice as sd
            encoder.load_model(args.enc_model_fpath)
            synthesizer = Synthesizer(
                args.syn_model_dir.joinpath("taco_pretrained"),
                low_mem=args.low_mem)
            vocoder.load_model(args.voc_model_fpath)
            num_generated = 0
            in_fpath = legoutput[1]
            print(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")
            text = str(lig)
            texts = [text]
            embeds = [embed]
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")
            #if not args.no_sound:
            #    sd.stop()
            #    sd.play(generated_wav, synthesizer.sample_rate)
            fpath = "static/output.wav"
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % fpath)
            return render_template("index.html",
                                   output=htmloader(text, legoutput[1], fpath))
        except Exception as e:
            return render_template("index.html",
                                   output="Caught exception: %s" % repr(e))
Example #10
class VoiceCloner:
    def __init__(self):
        # Info & args
        enc_model_fpath = Path("encoder/saved_models/pretrained.pt")

        syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
        voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
        low_mem = False

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
        vocoder.load_model(voc_model_fpath)

    def gen_audio(self, ref_audio, text):
        try:
            in_fpath = Path(ref_audio.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            # preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath, sr=None)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file succesfully, rate=%s" % sampling_rate)

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            # generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")
            print("\n samples = %s @ %s" % (len(generated_wav), self.synthesizer.sample_rate))

            return generated_wav

        except Exception as e:
            traceback.print_exc()
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #11
                              np.random.choice(smpl_files,
                                               size=num_sample_per_spk))
                             for spk, smpl_files in spk_sample_files_pair]

    # encode the samples
    print('encoding samples, this might take a while...')
    spk_sample_embed_pair = [
        (spk, [get_embeddings_from_wav(x, encoder) for x in smpl_files])
        for spk, smpl_files in spk_sample_files_pair
    ]
    # process generated speech #######################################################################################
    # sample sentence to be synthesized
    text = 'The most merciful thing in the world, I think, is the inability of the human mind to correlate all its contents.'
    # inference LPC spectrogram based on input text and speaker embeddings
    specs = synthesizer.synthesize_spectrograms(
        [text] * num_sample_per_spk * num_speaker,
        [item for _, embeds in spk_sample_embed_pair for item in embeds])
    # vocode from the inferred LPC
    print('Vocoding, this might take a while...')
    generated_wavs = [vocoder.infer_waveform(spec) for spec in specs]
    # pad each synthesized wav with one second of trailing silence (same post-processing as the other examples)
    generated_wavs_padded = [
        np.pad(g_wav, (0, synthesizer.sample_rate), mode="constant")
        for g_wav in generated_wavs
    ]
    # evaluate the speaker embeddings of the generated wav
    print('encoding generated samples, this might take a while...')
    embeds_generated = [
        encoder.embed_utterance(g_wav_padded)
        for g_wav_padded in generated_wavs_padded
    ]
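
A sketch of comparing the generated embeddings against each speaker's enrollment embeddings, reusing the names from the snippet above and the cosine-similarity convention of the earlier examples; it assumes `embeds_generated` keeps the speaker-major ordering used when the spectrograms were synthesized.

import numpy as np
import scipy.spatial.distance

gen_iter = iter(embeds_generated)
for spk, sample_embeds in spk_sample_embed_pair:
    # Average the enrollment embeddings into a per-speaker centroid.
    centroid = np.mean(np.stack(sample_embeds), axis=0)
    for _ in range(num_sample_per_spk):
        sim = 1.0 - scipy.spatial.distance.cosine(centroid, next(gen_iter))
        print('%s: similarity of generated speech to enrollment = %.3f' % (spk, sim))
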
Example #12
def run_DeepTalk_demo(ref_audio_path='samples/ref_VCTKp240.wav',
                      output_text='Hello World',
                      enc_model_fpath=config.enc_model_fpath,
                      enc_module_name=config.enc_module_name,
                      syn_model_dir=config.syn_model_dir,
                      voc_model_fpath=config.voc_model_fpath,
                      key_embed=None):
    class hyperparameter:
        def __init__(self):

            self.enc_model_fpath = enc_model_fpath
            self.enc_module_name = enc_module_name
            self.syn_model_dir = syn_model_dir
            self.voc_model_fpath = voc_model_fpath

            self.enc_normalize = False
            self.voc_normalize = True
            self.low_mem = False  # If True, the memory used by the synthesizer is freed after each use.
                                  # Adds a large overhead but saves some GPU memory for lower-end GPUs.
            self.no_sound = False  # If True, audio won't be played.
            self.sampling_rate = 16000  # 16000 for mel-spectrogram-based methods; 8000 for fCNN-based methods
            self.ref_audio_path = ref_audio_path
            self.output_text = output_text

    args = hyperparameter()

    ## Load trained models: Encoder, Synthesizer, and Vocoder
    # os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    encoder.load_model(args.enc_model_fpath, module_name=args.enc_module_name)
    synthesizer = Synthesizer(args.syn_model_dir, low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    ## Encoding stage
    print('---------------------------------------------------------------')
    print('Stage 1/3: Encoder')
    print('---------------------------------------------------------------')
    wav = Synthesizer.load_preprocess_wav(args.ref_audio_path)
    ref_audio = encoder.preprocess_wav(wav)

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True,
                                                       key_embed=key_embed)
    if (args.enc_normalize):
        embed = embed / np.linalg.norm(embed)

    if (embed.shape[0] == 128):
        embed = np.concatenate((embed, embed), axis=0)

    ## Synthesizing stage
    print('---------------------------------------------------------------')
    print('Stage 2/3: Synthesizer')
    print('---------------------------------------------------------------')
    texts = args.output_text
    # texts = re.split(',|.',texts)
    texts = re.split(r'[,.]\s*', texts)
    texts[:] = [x for x in texts if x]
    print(texts)
    # texts = texts.split("\n")
    # texts = texts.split(".")
    # texts = texts.split(",")
    embeds = np.stack([embed] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    synthesized_mel = np.concatenate(specs, axis=1)

    ## Vocoding stage
    print('---------------------------------------------------------------')
    print('Stage 3/3: Vocoder')
    print('---------------------------------------------------------------')
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel,
                                  progress_callback=no_action,
                                  normalize=args.voc_normalize)
    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    synthesized_wav = wav1 / np.abs(wav1).max() * 0.97

    return synthesized_wav, Synthesizer.sample_rate, embed
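
A sketch of calling `run_DeepTalk_demo` and saving its output; the reference path matches the function's default and the output file name is a placeholder.

import soundfile

synth_wav, sample_rate, ref_embed = run_DeepTalk_demo(
    ref_audio_path='samples/ref_VCTKp240.wav',
    output_text='This sentence is spoken in the reference voice.')
soundfile.write('deeptalk_output.wav', synth_wav, sample_rate)
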
Example #13
class Cloner:
    def __init__(self, encoder_model_path, synthesizer_model_path,
                 vocoder_model_path):
        print("Preparing the encoder, the synthesizer and the vocoder...")
        self.encoder = encoder
        # self.encoder.load_model(Path(encoder_model_path))
        self.synthesizer = Synthesizer(
            Path(synthesizer_model_path).joinpath("taco_pretrained"),
            low_mem=False)
        self.vocoder = vocoder
        self.vocoder.load_model(Path(vocoder_model_path))

    def synthesize_embeds(self, audio_seq_path, speaker, synthesizer_root,
                          encoder_model_path, n_processes, embed_config_path):

        # Preprocess the dataset
        with open(embed_config_path, "r") as handle:
            embeddings_metadata = json.load(handle)

        embeddings = create_embeddings(audio_seq_path, speaker,
                                       synthesizer_root, encoder_model_path,
                                       n_processes)
        meta = []

        if embeddings:
            with open(embed_config_path, "w") as handle:
                for embed in embeddings:
                    em_id = str(uuid4())
                    embeddings_metadata[em_id] = {
                        'audio_path': str(embed[0]),
                        'embed_path': str(embed[1]),
                        'seq_length': embed[2],
                        'speaker': speaker
                    }
                    meta.append({'embed_id': em_id, 'seq_length': embed[2]})

                json.dump(embeddings_metadata, handle)

        return meta

    def generate_audio(self, embed_path, texts, n_process, save_path):
        # preprocessed_wav = self.encoder.preprocess_wav(embed_path)
        # embed = encoder.embed_utterance(preprocessed_wav)
        # print(embed.shape)
        embed = np.load(embed_path)
        # text_len = ((len(texts) / n_process) * n_process) + (len(texts) % n_process) + n_process - (len(texts) % n_process)

        generated_wav = None
        for i in range(0, len(texts), n_process):
            embeds = np.stack([embed] * len(texts[i:i + n_process]))
            print(texts[i:i + n_process], embeds.shape)
            specs = self.synthesizer.synthesize_spectrograms(
                texts[i:i + n_process], embeds)
            breaks = [spec.shape[1] for spec in specs]
            print("breaks: ", breaks)
            spec = np.concatenate(specs, axis=1)

            wav = self.vocoder.infer_waveform(spec)
            # Add breaks
            b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
            print("b_ends: ", b_ends)
            b_starts = np.concatenate(([0], b_ends[:-1]))
            print("b_starts: ", b_starts)
            wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
            breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))
                      ] * len(breaks)
            print("final breaks: ", breaks)
            wav = np.concatenate(
                [i for w, b in zip(wavs, breaks) for i in (w, b)])

            if generated_wav is None:
                generated_wav = wav
            else:
                generated_wav = np.concatenate((generated_wav, wav))

        del embed
        del embeds
        del wav
        del wavs

        if generated_wav is not None:
            # Save it on the disk
            print(generated_wav.dtype)
            librosa.output.write_wav(save_path,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % save_path)

            del generated_wav

            return True

        return False
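
A usage sketch for the `Cloner` class above; the model directories, embedding path and output path are placeholders (note that the constructor appends "taco_pretrained" to the synthesizer directory itself).

cloner = Cloner("encoder/saved_models/pretrained.pt",
                "synthesizer/saved_models/logs-pretrained",
                "vocoder/saved_models/pretrained/pretrained.pt")
cloner.generate_audio(embed_path="embeds/speaker_embed.npy",   # precomputed .npy embedding
                      texts=["First sentence.", "Second sentence."],
                      n_process=2,
                      save_path="cloned_long.wav")
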
from IPython.display import Audio
from IPython.utils import io
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
encoder_weights = Path("encoder/saved_models/pretrained.pt")
vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("synthesizer/saved_models/logs-train_ppg/taco_pretrained")
encoder.load_model(encoder_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)

ppg = np.load('/data/AutoSpeech/vc/LibriSpeech/SV2TTS/synthesizer/ppgs/ppg-61-70968-0000_00.npy')
embed = np.load('/data/AutoSpeech/vc/LibriSpeech/SV2TTS/synthesizer/embeds/ppg-908-31957-0015_00.npy')
specs = synthesizer.synthesize_spectrograms([ppg], [embed])
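
The PPG-driven snippet above stops after synthesizing the spectrograms; a sketch of the remaining vocoding and saving steps, with a placeholder output path.

import soundfile

spec = np.concatenate(specs, axis=1)
wav = vocoder.infer_waveform(spec)
soundfile.write('ppg_output.wav', wav, Synthesizer.sample_rate)
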
Example #15
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
# returns, but here we're going to make one ourselves just for the sake of showing that it's
# possible.
embed = np.random.rand(speaker_embedding_size)
# Embeddings are L2-normalized (this isn't important here, but if you want to make your own
# embeddings it will be).
embed /= np.linalg.norm(embed)
# The synthesizer can handle multiple inputs with batching. Let's create another embedding to
# illustrate that
embeds = [embed, np.zeros(speaker_embedding_size)]
print(embeds)
text_arr = ["test 1", "test 2"]
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
mels = synthesizer.synthesize_spectrograms(text_arr, embeds)

# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
# can concatenate the mel spectrograms to a single one.
mel = np.concatenate(mels, axis=1)
# The vocoder can take a callback function to display the generation. More on that later. For
# now we'll simply hide it like this:
no_action = lambda *args: None
print("\tTesting the vocoder...")
# For the sake of making this test short, we'll pass a short target length. The target length
# is the length of the wav segments that are processed in parallel. E.g. for audio sampled
# at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
# 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
# that has a detrimental effect on the quality of the audio. The default parameters are
# recommended in general.
vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
def maux(output_text, num):

    print("debug -- django")

    ## Info & args
    # parser = argparse.ArgumentParser(
    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter
    # )
    # parser.add_argument("-e", "--enc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt",
    #                     help="Path to a saved encoder")
    # parser.add_argument("-s", "--syn_model_dir", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/",
    #                     help="Directory containing the synthesizer model")
    # parser.add_argument("-v", "--voc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt",
    #                     help="Path to a saved vocoder")
    # parser.add_argument("--low_mem", action="store_true", help=\
    #     "If True, the memory used by the synthesizer will be freed after each use. Adds large "
    #     "overhead but allows to save some GPU memory for lower-end GPUs.")
    # parser.add_argument("--no_sound", action="store_true", help=\
    #     "If True, audio won't be played.")
    # args = parser.parse_args()
    # print_args(args, parser)
    # if not args.no_sound:
    #     import sounddevice as sd

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    #encoder.load_model(args.enc_model_fpath)
    #synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    #vocoder.load_model(args.voc_model_fpath)
    encoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt"
    )

    synthesizer = Synthesizer(
        "D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/taco_pretrained",
        low_mem=False)

    vocoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt"
    )

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second; it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to 1 second of audio.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel,
                           target=200,
                           overlap=50,
                           progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print(
        "This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
        "show how you can interface this project easily with your own. See the source code for "
        "an explanation of what is happening.\n")

    print("Interactive generation loop")

    in_fpath = Path(
        "D:/RemindMe/django-remindme/mysite/trained model/sam_narration2.wav")
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file succesfully")
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")
    embeds = [embed]

    text = output_text
    texts = [text]

    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Play the audio (non-blocking)

    # Save it on the disk
    filexpath = "D:/RemindMe/django_remindme_model/mysite/media/demo_output_%02d.wav" % num
    fx = "demo_output_%02d" % num
    print(generated_wav.dtype)
    librosa.output.write_wav(filexpath, generated_wav.astype(np.float32),
                             synthesizer.sample_rate)

    print("\nSaved output as %s\n\n" % filexpath)

    return fx
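
`maux` passes the whole text to the synthesizer in one piece; when the input is long, splitting it into sentences first (as Example #7 and Example #12 do) tends to give more stable results. A sketch reusing `output_text`, `embed` and `synthesizer` from the function above:

import re
import numpy as np

# One embedding per sentence; mirrors the re.split usage in the other examples.
sentences = [s for s in re.split(r"[.!?]\s+", output_text) if s]
embeds = np.stack([embed] * len(sentences))
specs = synthesizer.synthesize_spectrograms(sentences, embeds)
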
Example #17
class RtvcBackend:

    singleton = None

    def init(self):
        reporoot = Path('/workspace/rtvc')
        enc_model_fpath = reporoot.joinpath(
            'encoder/saved_models/pretrained.pt')
        voc_model_fpath = reporoot.joinpath(
            'vocoder/saved_models/pretrained/pretrained.pt')
        syn_model_dir = reporoot.joinpath(
            'synthesizer/saved_models/logs-pretrained')

        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.",
                file=sys.stderr)
            return False
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(
            syn_model_dir.joinpath("taco_pretrained"), low_mem=False)
        vocoder.load_model(voc_model_fpath)
        ## Run a test
        print("Testing your configuration with small inputs.")
        # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
        # sampling rate, which may differ.
        # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
        # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
        # The sampling rate is the number of values (samples) recorded per second; it is set to
        # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
        # to 1 second of audio.
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
        # returns, but here we're going to make one ourselves just for the sake of showing that it's
        # possible.
        embed = np.random.rand(speaker_embedding_size)
        # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
        # embeddings it will be).
        embed /= np.linalg.norm(embed)
        # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
        # illustrate that
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
        # can concatenate the mel spectrograms to a single one.
        mel = np.concatenate(mels, axis=1)
        # The vocoder can take a callback function to display the generation. More on that later. For
        # now we'll simply hide it like this:
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        # For the sake of making this test short, we'll pass a short target length. The target length
        # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
        # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
        # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
        # that has a detrimental effect on the quality of the audio. The default parameters are
        # recommended in general.
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)

        print("All test passed! You can now synthesize speech.\n\n")

        return True

    #TODO: use sox python lib?
    #def silence_removal(wav_fn):
    #  tfm = sox.Transformer()
    #  array_out = tfm.build_array(input_filepath=wav_fn)
    #  return self._silence_removal(array_out, tfm)

    #def silence_removal_array(self, wav, tfm=None):
    #  tfm = tfm or sox.Transformer()
    #  y_out = tfm.build_array(input_array=wav, sample_rate_in=sample_rate)
    #  tfm.set_output_format(rate=22050)
    #  return tfm.build_array(input_array=y_out, sample_rate_in=sample_rate)

    #  sox.transform()
    #  print(f"remove silence from {wav_fn}")

    #def convert_array(self, wav, text):
    #  pass

    def convert(self, text, in_fpath, outfn):
        print(f"converting\ntext:\n {text}\n\n wavfn: {in_fpath}")
        print(f"outfn: {outfn}")

        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file succesfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            #TODO: check whether this padding is still necessary
            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")

            #TODO: Save it on the disk?
            #fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            librosa.output.write_wav(outfn, generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            #num_generated += 1
            print("\nSaved output as %s\n\n" % outfn)
            return True
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            return False
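
The target/overlap arguments passed to vocoder.infer_waveform in the configuration test above control how the waveform is cut into segments that are generated in parallel. A minimal sketch of that splitting arithmetic, under the assumption that segments of `target` samples are extended by a small `overlap` for cross-fading (the actual folding inside the vocoder may differ in detail):

def chunk_boundaries(n_samples, target=8000, overlap=800):
    # Illustrative only: split a waveform of n_samples into segments of
    # `target` samples plus a small `overlap` used for cross-fading, the
    # way a batched vocoder would generate them together.
    starts = list(range(0, n_samples, target))
    return [(s, min(s + target + overlap, n_samples)) for s in starts]

# One second of audio at 16 kHz with target=8000 -> two ~0.5 s chunks that can
# be generated in parallel instead of sample by sample.
print(chunk_boundaries(16000))  # [(0, 8800), (8000, 16000)]
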
Beispiel #18
0
class Toolbox:
    def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, toolbox_files_dir, low_mem):
        sys.excepthook = self.excepthook

        self._out_dir = Path(toolbox_files_dir)
        self.make_out_dirs()

        self.datasets_root = datasets_root
        self.low_mem = low_mem
        self.utterances = set()
        self.current_generated = (None, None, None, None)  # speaker_name, spec, breaks, wav

        self.synthesizer = None  # type: Synthesizer

        # Initialize the events and the interface
        self.ui = UI()
        self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
        self.setup_events()
        self.ui.start()

    def make_out_dirs(self):
        self._out_dir.mkdir(exist_ok=True)

        self._out_mel_dir = self._out_dir.joinpath('mels')
        self._out_mel_dir.mkdir(exist_ok=True)

        self._out_wav_dir = self._out_dir.joinpath('wavs')
        self._out_wav_dir.mkdir(exist_ok=True)

        self._out_embed_dir = self._out_dir.joinpath('embeds')
        self._out_embed_dir.mkdir(exist_ok=True)

        self._out_record_dir = self._out_dir.joinpath('records')
        self._out_record_dir.mkdir(exist_ok=True)

    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)

    def setup_events(self):
        # Dataset, speaker and utterance selection
        self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser())
        random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root,
                                                                     recognized_datasets,
                                                                     level)
        text_func = lambda: self.ui.text_prompt.setPlainText(np.random.choice(total_texts, 1)[0])
        self.ui.random_dataset_button.clicked.connect(text_func)
        self.ui.random_speaker_button.clicked.connect(random_func(1))
        self.ui.random_utterance_button.clicked.connect(random_func(2))
        self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
        self.ui.speaker_box.currentIndexChanged.connect(random_func(2))

        # Model selection
        self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)

        def func():
            self.synthesizer = None

        self.ui.synthesizer_box.currentIndexChanged.connect(func)
        self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)

        # Utterance selection
        func = lambda: self.load_from_browser(self.ui.browse_file())
        self.ui.browser_browse_button.clicked.connect(func)
        func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
        self.ui.utterance_history.currentIndexChanged.connect(func)
        func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
        self.ui.play_button.clicked.connect(func)
        self.ui.stop_button.clicked.connect(self.ui.stop)
        self.ui.record_button.clicked.connect(self.record)

        # Generation
        func = lambda: self.synthesize() or self.vocode()
        self.ui.generate_button.clicked.connect(func)
        self.ui.synthesize_button.clicked.connect(self.synthesize)
        self.ui.vocode_button.clicked.connect(self.vocode)

        # UMAP legend
        self.ui.clear_button.clicked.connect(self.clear_utterances)

    def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir):
        self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True)
        self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir)

    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            # name = '-'.join(fpath.relative_to(self.datasets_root).parts)
            speaker_name = "-".join((self.ui.current_dataset_name.replace("\\", "_").replace("/", "_"),
                                     self.ui.current_speaker_name.replace("\\", "_").replace("/", "_")))
            name = "-".join((speaker_name, self.ui.current_utterance_name.replace("\\", "_").replace("/", "_")))
            # name = '-'.join(fpath.relative_to(self.datasets_root.joinpath(self.ui.current_dataset_name)).parts)
            # speaker_name = self.ui.current_speaker_name.replace("\\", "-").replace("/", "-")
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)

    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return

        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_%d" % int(time.time())
        audio.save_wav(wav, self._out_record_dir.joinpath(name + '.wav'), encoder.sampling_rate)  # save

        self.add_real_utterance(wav, name, speaker_name)

    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)

    def clear_utterances(self):
        self.utterances.clear()
        self.ui.draw_umap_projections(self.utterances)

    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("taco_pretrained")
            self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

        ptext = self.ui.text_prompt.toPlainText()
        # if ptext.startswith("py"):  # 适用于sync2,适应训练时候用pinyin+chinese_cleaners的bug
        #     ptext = get_pinyin(ptext[2:])  # 把chinese_cleaners的lowercase用起来,否则不能合成。
        ptext = " ".join(text2pinyin(ptext))
        texts = ptext.split("\n")
        print(dict(texts=texts))
        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)

        # Trim silence or noise at the start and end
        for num, spec in enumerate(specs):
            tmp = spec.T
            sidx, eidx = find_start_end_points(tmp)
            specs[num] = tmp[sidx:eidx].T

        # specs = [spec.T[:find_endpoint(spec.T)].T for spec in specs]  # find endpoint
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftext = '。'.join(texts)
        ftime = '{}'.format(int(time.time()))
        fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(fref, ftime, len(ftext), ftext))
        np.save(self._out_mel_dir.joinpath(fname), spec, allow_pickle=False)  # save

        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
        self.ui.set_loading(0)

    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)

    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath

        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)

    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Case of Griffin-lim
        if model_fpath is None:
            return

        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        vocoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
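
The vocode method above turns the per-sentence spectrogram widths (breaks) back into sample positions and re-inserts a short pause after every sentence. A self-contained sketch of that bookkeeping, using made-up frame counts and an assumed hop size of 200 samples at a 16 kHz synthesizer rate:

import numpy as np

# Hypothetical values: three sentences whose spectrograms are 50, 80 and 60
# frames wide; each frame corresponds to hop_size samples of vocoder output.
breaks = [50, 80, 60]
hop_size = 200
sample_rate = 16000
wav = np.random.rand(sum(breaks) * hop_size)  # stands in for the vocoder output

# Frame counts -> cumulative sample positions where each sentence ends.
b_ends = np.cumsum(np.array(breaks) * hop_size)  # [10000, 26000, 38000]
b_starts = np.concatenate(([0], b_ends[:-1]))    # [    0, 10000, 26000]

# Cut the waveform back into per-sentence segments and append 0.15 s of
# silence to each one before concatenating, as done above before playback.
segments = [wav[start:end] for start, end in zip(b_starts, b_ends)]
silence = np.zeros(int(0.15 * sample_rate))
wav_out = np.concatenate([x for seg in segments for x in (seg, silence)])

print(len(wav), len(wav_out))  # 38000 45200 (three 2400-sample pauses added)
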
Beispiel #19
0
class Toolbox:
    def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, toolbox_files_dir, low_mem):
        sys.excepthook = self.excepthook

        self._out_dir = Path(toolbox_files_dir)
        self.make_out_dirs()

        self.datasets_root = datasets_root
        self.datasets = [p.name for p in Path(datasets_root).glob("*") if p.is_dir()]

        metapath = Path(self.datasets_root).joinpath("metadata.csv")
        if metapath.is_file():
            itdt = {}
            for line in open(metapath, encoding="utf8"):
                idx, text = line.strip().split("\t")
                itdt[idx] = text
            self.itdt = itdt
        else:
            self.itdt = {}

        self.low_mem = low_mem
        self.utterances = set()
        self.current_generated = (None, None, None, None)  # speaker_name, spec, breaks, wav

        self.synthesizer = None  # type: Synthesizer

        # Initialize the events and the interface
        self.ui = UI()
        self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
        self.setup_events()
        self.ui.start()

    def make_out_dirs(self):
        self._out_dir.mkdir(exist_ok=True)

        self._out_mel_dir = self._out_dir.joinpath('mels')
        self._out_mel_dir.mkdir(exist_ok=True)

        self._out_wav_dir = self._out_dir.joinpath('wavs')
        self._out_wav_dir.mkdir(exist_ok=True)

        self._out_embed_dir = self._out_dir.joinpath('embeds')
        self._out_embed_dir.mkdir(exist_ok=True)

        self._out_record_dir = self._out_dir.joinpath('records')
        self._out_record_dir.mkdir(exist_ok=True)

    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)

    def setup_events(self):
        # Dataset, speaker and utterance selection
        self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser())
        random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root,
                                                                     self.datasets,
                                                                     level)
        text_func = lambda: self.ui.text_prompt.setPlainText(np.random.choice(total_texts))
        self.ui.random_dataset_button.clicked.connect(text_func)
        self.ui.random_speaker_button.clicked.connect(random_func(1))
        self.ui.random_utterance_button.clicked.connect(random_func(2))
        self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
        self.ui.speaker_box.currentIndexChanged.connect(random_func(2))

        # Model selection
        self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)

        def func():
            self.synthesizer = None

        self.ui.synthesizer_box.currentIndexChanged.connect(func)
        self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)

        # Utterance selection
        func = lambda: self.load_from_browser(self.ui.browse_file())
        self.ui.browser_browse_button.clicked.connect(func)
        func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
        self.ui.utterance_history.currentIndexChanged.connect(func)
        func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
        self.ui.play_button.clicked.connect(func)
        self.ui.stop_button.clicked.connect(self.ui.stop)
        self.ui.record_button.clicked.connect(self.record)
        self.ui.take_generated_button.clicked.connect(self.preprocess)

        # Generation
        func = lambda: self.synthesize() or self.vocode()
        self.ui.generate_button.clicked.connect(func)
        self.ui.compare_button.clicked.connect(self.compare)
        self.ui.synthesize_button.clicked.connect(self.synthesize)
        self.ui.vocode_button.clicked.connect(self.vocode)

        # UMAP legend
        self.ui.clear_button.clicked.connect(self.clear_utterances)

    def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir):
        self.ui.populate_browser(self.datasets_root, self.datasets, 0, True)
        self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir)

    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            # name = '/'.join(fpath.relative_to(self.datasets_root).parts)
            dat = self.ui.current_dataset_name.replace("\\", "#").replace("/", "#")
            spk = self.ui.current_speaker_name.replace("\\", "#").replace("/", "#")
            aud = self.ui.current_utterance_name.replace("\\", "#").replace("/", "#")
            speaker_name = "#".join((dat, spk))
            name = "#".join((speaker_name, aud))
            # name = '-'.join(fpath.relative_to(self.datasets_root.joinpath(self.ui.current_dataset_name)).parts)
            # speaker_name = self.ui.current_speaker_name.replace("\\", "-").replace("/", "-")
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)

    def compare(self):
        """
        1. Check whether the reference audio has a corresponding text.
        2. Update the text prompt with that reference text.
        3. Synthesize speech for the reference text.
        4. Display the embedding, spectrogram and alignment.
        :return:
        """
        idx = self.ui.selected_utterance.name.replace("#", "/")
        idx = re.sub(r"(_preprocessed)(\..*?)$", r"\2", idx)
        if idx not in self.itdt:
            print("Compare Failed! index: {}".format(idx))
            return

        self.ui.text_prompt.setPlainText(self.itdt[idx])
        self.synthesize()
        self.vocode()

    def preprocess(self):
        wav = self.ui.selected_utterance.wav
        out = aukit.remove_noise(wav, sr=Synthesizer.sample_rate)
        hp = aukit.Dict2Obj({})
        hp["vad_window_length"] = 10  # milliseconds
        hp["vad_moving_average_width"] = 2
        hp["vad_max_silence_length"] = 2
        hp["audio_norm_target_dBFS"] = -32
        hp["sample_rate"] = 16000
        hp["int16_max"] = (2 ** 15) - 1
        out = trim_long_silences(out, hparams=hp)

        spec = Synthesizer.make_spectrogram(out)
        self.ui.draw_align(spec[::-1], "current")

        name = filename_add_suffix(self.ui.selected_utterance.name, "_preprocessed")
        speaker_name = self.ui.selected_utterance.speaker_name
        self.add_real_utterance(out, name, speaker_name)

    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return

        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_{}".format(time_formatter())
        fpath = self._out_record_dir.joinpath(name + '.wav')
        audio.save_wav(wav, fpath, encoder.sampling_rate)  # save
        wav = Synthesizer.load_preprocess_wav(fpath)  # keep the data format consistent

        self.add_real_utterance(wav, name, speaker_name)

    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)

    def clear_utterances(self):
        self.utterances.clear()
        self.ui.draw_umap_projections(self.utterances)

    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = Path(self.ui.current_synthesizer_model_dir)
            checkpoints_dir = model_dir.joinpath("checkpoints")
            hp_path = model_dir.joinpath("metas", "hparams.json")    # load from trained models
            if hp_path.exists():
                hparams = aukit.Dict2Obj(json.load(open(hp_path, encoding="utf8")))
            else:
                hparams = None
            self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem, hparams=hparams)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

        ptext = self.ui.text_prompt.toPlainText()
        texts = ptext.split("\n")

        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs, aligns = self.synthesizer.synthesize_spectrograms(texts, embeds, return_alignments=True)

        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)
        align = np.concatenate(aligns, axis=1)

        fref = self.ui.selected_utterance.name
        ftext = '。'.join(texts)
        ftime = '{}'.format(time_formatter())
        fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(fref, ftime, len(ftext), ftext))
        np.save(self._out_mel_dir.joinpath(fname), spec, allow_pickle=False)  # save

        self.ui.draw_spec(spec, "generated")
        self.ui.draw_align(align, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
        self.ui.set_loading(0)

    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        wav = None
        vocname = ""
        if self.ui.current_vocoder_fpath is not None:
            model_fpath = self.ui.current_vocoder_fpath
            vocname = Path(model_fpath).parent.stem
            if Path(model_fpath).parent.stem == "melgan":
                self.ui.log("Waveform generation with MelGAN... ")
                wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)

            elif Path(model_fpath).parent.stem == "wavernn":
                self.ui.log("Waveform generation with WaveRNN... ")
                wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)

        if wav is None:
            vocname = "griffinlim"
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = self.ui.selected_utterance.name
        ftime = '{}'.format(time_formatter())
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fvoc = vocname
        fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
        audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_{}".format(time_formatter())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)

    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath

        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)

    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Case of Griffin-lim
        if model_fpath is None:
            return
        else:
            self.ui.log("Loading the vocoder %s... " % model_fpath)
            self.ui.set_loading(1)
            start = timer()
            if Path(model_fpath).parent.stem == "melgan":
                vocoder_melgan.load_vocoder_melgan(model_fpath)
            elif Path(model_fpath).parent.stem == "wavernn":
                vocoder.load_model(model_fpath)
            else:
                return
            self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
            self.ui.set_loading(0)
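
Beispiel #19 picks the waveform generator from the name of the checkpoint's parent folder: "melgan" and "wavernn" map to the corresponding vocoders, anything else falls back to Griffin-Lim. A small standalone sketch of that dispatch (the paths are made up; loading and inference are left to the surrounding project):

from pathlib import Path

def pick_vocoder(model_fpath):
    # Mirrors the folder-name convention used in init_vocoder/vocode above.
    if model_fpath is None:
        return "griffinlim"
    stem = Path(model_fpath).parent.stem
    return stem if stem in ("melgan", "wavernn") else "griffinlim"

print(pick_vocoder("vocoder/saved_models/melgan/multi_band_melgan.pt"))  # melgan
print(pick_vocoder("vocoder/saved_models/wavernn/pretrained.pt"))        # wavernn
print(pick_vocoder(None))                                                # griffinlim
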
Beispiel #20
0
def main():
    wav_list = []
    for filename in os.listdir(cross_sample_dir):
        # print(filename)
        wav_list.append(filename)
    wav_list.sort()
    print(wav_list)
    # return

    # if args.cpu:
    #     # Hide GPUs from Pytorch to force CPU processing
    #     os.environ["CUDA_VISIBLE_DEVICES"] = ""

    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    # encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    # vocoder.load_model(voc_model_fpath)

    for i, wav_name in enumerate(wav_list):
        print('first:', wav_name)
        assert wav_name.split('.')[-1] == 'wav'
        for j, text in enumerate(input_text):
            # speaker embedding
            wav_path = os.path.join(cross_sample_dir, wav_name)

            GE2E_npy = 'spk-' + wav_name.split('.')[0] + '-GE2E.npy'
            GE2E_abs_path = os.path.join(cross_sample_dir, GE2E_npy)
            print(GE2E_abs_path)
            # return
            embed_utterance(
                (wav_path, GE2E_abs_path),
                encoder_model_fpath=Path('encoder/saved_models/pretrained.pt'))
            embed = np.load(GE2E_abs_path)
            print("Created the embedding")

            print(embed.shape)

            # return

            # mel_reference_path = 'mel-' + wav_name.split('.')[0] + '-mel.npy'
            # mel_reference_abs_path = os.path(cross_sample_dir, mel_reference_path)
            mel_reference, _linear_spectrogram, _out = wav2mel(wav_path)
            # mel_reference = mel_reference
            print(mel_reference.shape)
            # return

            # # If seed is specified, reset torch seed and force synthesizer reload
            # if args.seed is not None:
            #     torch.manual_seed(args.seed)
            #     synthesizer = Synthesizer(args.syn_model_fpath)

            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            mels = synthesizer.synthesize_spectrograms(texts, embeds)
            mel = mels[0]
            mel = mel.T

            print('reference:', mel_reference.shape)

            print("Created the mel spectrogram")

            os.makedirs('log_Kiss_GE2E_SayEN_syn_wavs_Cross', exist_ok=True)
            _wav_pre = mel2wav(
                mel,
                wav_name_path=os.path.join(
                    'log_Kiss_GE2E_SayEN_syn_wavs_Cross',
                    'spk_' + str(i) + '_' + str(j) + '_pre.wav'))
            _wav_target = mel2wav(
                mel_reference,
                wav_name_path=os.path.join(
                    'log_Kiss_GE2E_SayEN_syn_wavs_Cross',
                    'spk_' + str(i) + '_' + str(j) + '_reference.wav'))
Beispiel #21
0
class Toolbox:
    def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, low_mem, seed, no_mp3_support):
        if not no_mp3_support:
            try:
                librosa.load("samples/6829_00000.mp3")
            except NoBackendError:
                print("Librosa will be unable to open mp3 files if additional software is not installed.\n"
                  "Please install ffmpeg or add the '--no_mp3_support' option to proceed without support for mp3 files.")
                exit(-1)
        self.no_mp3_support = no_mp3_support
        sys.excepthook = self.excepthook
        self.datasets_root = datasets_root
        self.low_mem = low_mem
        self.utterances = set()
        self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
        
        self.synthesizer = None # type: Synthesizer
        self.current_wav = None
        self.waves_list = []
        self.waves_count = 0
        self.waves_namelist = []

        # Check for webrtcvad (enables removal of silences in vocoder output)
        try:
            import webrtcvad
            self.trim_silences = True
        except ImportError:
            self.trim_silences = False

        # Initialize the events and the interface
        self.ui = UI()
        self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir, seed)
        self.setup_events()
        self.ui.start()

    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)
        
    def setup_events(self):
        # Dataset, speaker and utterance selection
        self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser())
        random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root,
                                                                     recognized_datasets,
                                                                     level)
        self.ui.random_dataset_button.clicked.connect(random_func(0))
        self.ui.random_speaker_button.clicked.connect(random_func(1))
        self.ui.random_utterance_button.clicked.connect(random_func(2))
        self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
        self.ui.speaker_box.currentIndexChanged.connect(random_func(2))
        
        # Model selection
        self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)
        def func(): 
            self.synthesizer = None
        self.ui.synthesizer_box.currentIndexChanged.connect(func)
        self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)
        
        # Utterance selection
        func = lambda: self.load_from_browser(self.ui.browse_file())
        self.ui.browser_browse_button.clicked.connect(func)

        #Audio book
        func = lambda: self.load_book(self.ui.browse_book())
        self.ui.load_book.clicked.connect(func)

        func = lambda: self.synthBook()
        self.ui.synth_book.clicked.connect(func)

        #
        func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
        self.ui.utterance_history.currentIndexChanged.connect(func)
        func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
        self.ui.play_button.clicked.connect(func)
        self.ui.stop_button.clicked.connect(self.ui.stop)
        self.ui.record_button.clicked.connect(self.record)

        #Audio
        self.ui.setup_audio_devices(Synthesizer.sample_rate)

        #Wav playback & save
        func = lambda: self.replay_last_wav()
        self.ui.replay_wav_button.clicked.connect(func)
        func = lambda: self.export_current_wave()
        self.ui.export_wav_button.clicked.connect(func)
        self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)

        # Generation
        func = lambda: self.synthesize() or self.vocode()
        self.ui.generate_button.clicked.connect(func)
        self.ui.synthesize_button.clicked.connect(self.synthesize)
        self.ui.vocode_button.clicked.connect(self.vocode)
        self.ui.random_seed_checkbox.clicked.connect(self.update_seed_textbox)

        # UMAP legend
        self.ui.clear_button.clicked.connect(self.clear_utterances)

    def set_current_wav(self, index):
        self.current_wav = self.waves_list[index]

    def export_current_wave(self):
        self.ui.save_audio_file(self.current_wav, Synthesizer.sample_rate)

    def replay_last_wav(self):
        self.ui.play(self.current_wav, Synthesizer.sample_rate)

    def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir, seed):
        self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True)
        self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir)
        self.ui.populate_gen_options(seed, self.trim_silences)
        
    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name
            
            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return 
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        if fpath.suffix.lower() == ".mp3" and self.no_mp3_support:
                self.ui.log("Error: No mp3 file argument was passed but an mp3 file was used")
                return

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)


    def load_book(self, fpath=None):
        if fpath is None or fpath == "":
            return
        if fpath.suffix.lower() == ".txt":
            self.ui.text_prompt.clear()
            with open(fpath.absolute().as_posix(), mode='r') as file:
                text = file.read()
            text = text.replace(".", ".\n")
            text = [line for line in text.split('\n') if line.strip() != '']
            text = '\n'.join(text)
            #text = text.replace(",",",\n")
            #text = text.replace(":",":\n")
            #text = text.replace("-","-\n")
            #text = text.replace("!","!\n")
            #text = text.replace("?","?\n")
            self.ui.text_prompt.appendPlainText(text)

        else:
            self.ui.log("Format is not supported")
            return

    def synthBook(self):
        k = 0
        self.ui.log('Book synthesis start')

        texts = self.ui.text_prompt.toPlainText().split("\n")
        n = 2
        for i in range(0, len(texts), n):
            newList = texts[i:i + n]
            sep = '\n'
            newText  = sep.join(newList)
            #self.ui.log(newText)
            self.ui.text_prompt.clear()
            self.ui.text_prompt.appendPlainText(newText)

            self.synthesize()
            self.vocode()
            percent = str(((k+1)/(len(texts)/n)*100))
            self.ui.log("Done for " + percent +"%")
            sf.write("output/{}.wav".format(k), self.current_wav, Synthesizer.sample_rate)
            k += 1

    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return 
        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_%05d" % np.random.randint(100000)
        self.add_real_utterance(wav, name, speaker_name)
        
    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)
        
    def clear_utterances(self):
        self.utterances.clear()
        self.ui.draw_umap_projections(self.utterances)
        
    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)
        
        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("taco_pretrained")
            self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

        # Update the synthesizer random seed
        if self.ui.random_seed_checkbox.isChecked():
            seed = self.synthesizer.set_seed(int(self.ui.seed_textbox.text()))
            self.ui.populate_gen_options(seed, self.trim_silences)
        else:
            seed = self.synthesizer.set_seed(None)
        
        texts = self.ui.text_prompt.toPlainText().split("\n")
        
        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)
        
        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
        self.ui.set_loading(0)

    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Initialize the vocoder model and make it deterministic, if the user provides a seed
        if self.ui.random_seed_checkbox.isChecked():
            seed = self.synthesizer.set_seed(int(self.ui.seed_textbox.text()))
            self.ui.populate_gen_options(seed, self.trim_silences)
        else:
            seed = None

        if seed is not None:
            torch.manual_seed(seed)

        # Synthesize the waveform
        if not vocoder.is_loaded() or seed is not None:
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)
        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")
        
        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Trim excessive silences
        if self.ui.trim_silences_checkbox.isChecked():
            wav = encoder.preprocess_wav(wav)

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        # Name it (history displayed in combobox)
        # TODO better naming for the combobox items?
        wav_name = str(self.waves_count + 1)

        #Update waves combobox
        self.waves_count += 1
        if self.waves_count > MAX_WAVES:
            self.waves_list.pop()
            self.waves_namelist.pop()
        self.waves_list.insert(0, wav)
        self.waves_namelist.insert(0, wav_name)

        self.ui.waves_cb.disconnect()
        self.ui.waves_cb_model.setStringList(self.waves_namelist)
        self.ui.waves_cb.setCurrentIndex(0)
        self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)

        # Update current wav
        self.set_current_wav(0)
        
        #Enable replay and save buttons:
        self.ui.replay_wav_button.setDisabled(False)
        self.ui.export_wav_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        
        # Add the utterance
        name = speaker_name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
        self.utterances.add(utterance)
        
        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
        
    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath
        
        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
           
    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Case of Griffin-lim
        if model_fpath is None:
            return 
    
        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        vocoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)

    def update_seed_textbox(self):
       self.ui.update_seed_textbox() 
def run_voice_cloning():
    ## Model locations
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
    ref_voice_path = request.json["voiceFile"]  # filename like ojo3.wav
    messages = request.json["messages"]  # array of strings
    low_mem = request.json.get("low_mem", False)  # whether to use LowMem Mode

    # Base64 encode the parameters so that we can reference this job in later api calls
    dataToEncodeAsID = ','.join(messages) + ref_voice_path
    encodedBytes = base64.b64encode(dataToEncodeAsID.encode("utf-8"))
    req_id = str(encodedBytes, "utf-8")
    # MD5-hash it so that the id has a consistent length
    req_id = hashlib.md5(req_id.encode('utf-8')).hexdigest()

    # Clear destination folder of generated sound files
    output_path = "/output/%s/" % req_id
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        return abort(500)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    in_fpath = Path(ref_voice_path)

    print("Computing the embedding")
    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file succesfully")

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")

    print("Generation loop")
    num_generated = 0
    fpath = None
    for text in messages:
        try:
            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            # Save it on the disk
            fpath = output_path + ("output_%03d.wav" % num_generated)
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")

    return req_id
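
The request id built at the top of run_voice_cloning is deterministic: the same messages and reference voice always hash to the same folder under /output/, so a later API call can locate the generated files. A standalone sketch of that derivation (the example inputs are placeholders):

import base64
import hashlib

def make_request_id(messages, ref_voice_path):
    # Same base64 + MD5 scheme as in run_voice_cloning above: base64 makes the
    # id reproducible from the parameters, MD5 gives it a fixed length.
    data = ','.join(messages) + ref_voice_path
    encoded = str(base64.b64encode(data.encode("utf-8")), "utf-8")
    return hashlib.md5(encoded.encode("utf-8")).hexdigest()

# Identical parameters always yield the identical id.
print(make_request_id(["Hello there.", "Nice to meet you."], "ojo3.wav"))
print(make_request_id(["Hello there.", "Nice to meet you."], "ojo3.wav"))
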
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}
        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check that the audio file is not too large (10 MB limit below)
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav,
                               (0, synthesizer.sample_rate),
                               mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind before reading, otherwise read() returns empty bytes
        return {"audio": fp.read()}

    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
Beispiel #24
0
def voice_cloning(audio_file, text, enc_model_fpath, syn_model_dir,
                  voc_model_fpath, low_mem):
    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second; it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to one second of audio.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms into a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel,
                           target=200,
                           overlap=50,
                           progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print(
        "This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
        "show how you can interface this project easily with your own. See the source code for "
        "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            # message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
            #           "wav, m4a, flac, ...):\n"
            # in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
            in_fpath = Path(audio_file.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file succesfully")

            # Then we derive the embedding. The speaker encoder exposes many functions and
            # parameters, mostly intended for in-depth research. You will typically only use
            # this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # text = input("Write a sentence (+-20 words) to be synthesized:\n")
            print('\n\nThe text to convert to speech: ', text)

            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            # # Play the audio (non-blocking)
            # if not args.no_sound:
            #     sd.stop()
            #     sd.play(generated_wav, synthesizer.sample_rate)

            # Save it on the disk
            fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #25
0
class Toolbox:
    def __init__(self, datasets_root, enc_models_dir, syn_models_dir,
                 voc_models_dir, low_mem):
        sys.excepthook = self.excepthook
        self.datasets_root = datasets_root
        self.low_mem = low_mem
        self.utterances = set()
        # speaker_name, spec, breaks, wav
        self.current_generated = (None, None, None, None)

        self.synthesizer = None  # type: Synthesizer

        # Initialize the events and the interface
        self.ui = UI()
        self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
        self.setup_events()
        self.ui.start()

    def excepthook(self, exc_type, exc_value, exc_tb):
        traceback.print_exception(exc_type, exc_value, exc_tb)
        self.ui.log("Exception: %s" % exc_value)

    def setup_events(self):
        # Dataset, speaker and utterance selection
        self.ui.browser_load_button.clicked.connect(
            lambda: self.load_from_browser())
        random_func = lambda level: lambda: self.ui.populate_browser(
            self.datasets_root, recognized_datasets, level)
        self.ui.random_dataset_button.clicked.connect(random_func(0))
        self.ui.random_speaker_button.clicked.connect(random_func(1))
        self.ui.random_utterance_button.clicked.connect(random_func(2))
        self.ui.dataset_box.currentIndexChanged.connect(random_func(1))
        self.ui.speaker_box.currentIndexChanged.connect(random_func(2))

        # Model selection
        self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)

        def func():
            self.synthesizer = None

        self.ui.synthesizer_box.currentIndexChanged.connect(func)
        self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)

        # Utterance selection
        func = lambda: self.load_from_browser(self.ui.browse_file())
        self.ui.browser_browse_button.clicked.connect(func)
        func = lambda: self.ui.draw_utterance(self.ui.selected_utterance,
                                              "current")
        self.ui.utterance_history.currentIndexChanged.connect(func)
        func = lambda: self.ui.play(self.ui.selected_utterance.wav,
                                    Synthesizer.sample_rate)
        self.ui.play_button.clicked.connect(func)
        self.ui.stop_button.clicked.connect(self.ui.stop)
        self.ui.record_button.clicked.connect(self.record)

        # Generation
        func = lambda: self.synthesize() or self.vocode()
        self.ui.generate_button.clicked.connect(func)
        self.ui.synthesize_button.clicked.connect(self.synthesize)
        self.ui.vocode_button.clicked.connect(self.vocode)

        # UMAP legend
        self.ui.clear_button.clicked.connect(self.clear_utterances)

    def reset_ui(self, encoder_models_dir, synthesizer_models_dir,
                 vocoder_models_dir):
        self.ui.populate_browser(self.datasets_root, recognized_datasets, 0,
                                 True)
        self.ui.populate_models(encoder_models_dir, synthesizer_models_dir,
                                vocoder_models_dir)

    def load_from_browser(self, fpath=None):
        if fpath is None:
            fpath = Path(self.datasets_root, self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)
            name = str(fpath.relative_to(self.datasets_root))
            speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name

            # Select the next utterance
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
        # playback, so as to have a fair comparison with the generated audio
        wav = Synthesizer.load_preprocess_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)

    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return
        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_%05d" % np.random.randint(100000)
        self.add_real_utterance(wav, name, speaker_name)

    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.load_preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)

    def clear_utterances(self):
        self.utterances.clear()
        self.ui.draw_umap_projections(self.utterances)

    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = self.ui.current_synthesizer_model_dir
            checkpoints_dir = model_dir.joinpath("taco_pretrained")
            self.synthesizer = Synthesizer(checkpoints_dir,
                                           low_mem=self.low_mem)
        if not self.synthesizer.is_loaded():
            self.ui.log("Loading the synthesizer %s" %
                        self.synthesizer.checkpoint_fpath)

        texts = self.ui.text_prompt.toPlainText().split("\n")
        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
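        # Keep each line's mel frame count so vocode() can split the waveform and
        # insert pauses at the line boundaries.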
        breaks = [spec.shape[1] for spec in specs]
        spec = np.concatenate(specs, axis=1)

        self.ui.draw_spec(spec, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name,
                                  spec, breaks, None)
        self.ui.set_loading(0)

    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
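        # `breaks` holds per-line mel frame counts; multiplying by hop_size converts
        # frames into sample offsets within the concatenated waveform.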
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.load_preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)

    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath

        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)

    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Griffin-Lim has no model file to load
        if model_fpath is None:
            return

        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        vocoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
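
A self-contained sketch of the break-insertion step in vocode() above; HOP_SIZE and SAMPLE_RATE are assumed stand-ins for Synthesizer.hparams.hop_size and Synthesizer.sample_rate, and insert_breaks is a hypothetical name.

import numpy as np

HOP_SIZE = 200       # assumed samples per mel frame (stand-in for Synthesizer.hparams.hop_size)
SAMPLE_RATE = 16000  # assumed output rate (stand-in for Synthesizer.sample_rate)


def insert_breaks(wav: np.ndarray, frame_counts, pause_seconds: float = 0.15) -> np.ndarray:
    """Split a concatenated waveform at mel-frame boundaries and re-join it with short pauses."""
    # Convert per-line mel frame counts into cumulative sample offsets.
    ends = np.cumsum(np.array(frame_counts) * HOP_SIZE)
    starts = np.concatenate(([0], ends[:-1]))
    segments = [wav[s:e] for s, e in zip(starts, ends)]
    pause = np.zeros(int(pause_seconds * SAMPLE_RATE), dtype=wav.dtype)
    # Interleave a pause after every segment, mirroring the logic in vocode() above.
    return np.concatenate([piece for seg in segments for piece in (seg, pause)])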
Example #26
0
class Text2Speech:
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
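            # Presumably a warm-up pass: the synthesized outputs below are discarded.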
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)

    def synthesize_text_to_speech(self, lines):
        if (Text2SpeechModel == "dc_tts"):
            char2idx, idx2char = load_vocab()

            sents = [text_normalize(lines) + "E"]
            texts = np.zeros((len(sents), hp.max_N), np.int32)
            for i, sent in enumerate(sents):
                texts[i, :len(sent)] = [char2idx[char] for char in sent]
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1,
                                        allow_growth=True)
            with tf.Session(config=tf.ConfigProto(
                    gpu_options=gpu_options)) as sess:
                sess.run(tf.global_variables_initializer())

                # Restore parameters
                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             'Text2Mel')
                saver1 = tf.train.Saver(var_list=var_list)
                saver1.restore(sess,
                               tf.train.latest_checkpoint(hp.logdir + "-1"))
                print("Text2Mel Restored!")

                var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \
                        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')
                saver2 = tf.train.Saver(var_list=var_list)
                saver2.restore(sess,
                               tf.train.latest_checkpoint(hp.logdir + "-2"))
                print("SSRN Restored!")

                # Feed Forward
                ## mel
                L = texts
                Y = np.zeros((len(L), hp.max_T, hp.n_mels), np.float32)
                prev_max_attentions = np.zeros((len(L), ), np.int32)

                for j in tqdm(range(hp.max_T)):
                    _gs, _Y, _max_attentions, _alignments = \
                        sess.run([self.g.global_step, self.g.Y, self.g.max_attentions, self.g.alignments],
                                {self.g.L: L,
                                self.g.mels: Y,
                                self.g.prev_max_attentions: prev_max_attentions})
                    Y[:, j, :] = _Y[:, j, :]
                    prev_max_attentions = _max_attentions[:, j]

                # Get magnitude
                Z = sess.run(self.g.Z, {self.g.Y: Y})

                # Generate wav files
                #if not os.path.exists(hp.sampledir): os.makedirs(hp.sampledir)
                for i, mag in enumerate(Z):
                    wav = spectrogram2wav(mag)
                    #write(hp.sampledir + "/{}.wav".format(i+1), hp.sr, wav)
                    save_path = os.path.join(
                        os.path.abspath(os.path.dirname(__file__)),
                        "OUT/temp.wav")
                    end = np.zeros((22050))
                    wav = np.concatenate((wav, end), axis=0)
                    write(save_path, hp.sr, wav)
        elif Text2SpeechModel == "RTVC":
            text = lines
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(
                texts, self.embeds)
            spec = specs[0]
            generated_wav = vocoder.infer_waveform(spec)
            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")
            save_path = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), "OUT/temp.wav")
            end = np.zeros((22050))
            generated_wav = np.concatenate((generated_wav, end), axis=0)
            librosa.output.write_wav(save_path,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            clear_session()
        elif (Text2SpeechModel == "AudioSynth"):
            mels, alignment_history, audios = do_synthesis(
                lines, self.tacotron2, self.melgan_stft, "TACOTRON",
                "MELGAN-STFT", self.processor)
            end = np.zeros((22050))
            audios = np.concatenate((audios, end), axis=0)
            write(os.path.join(root_file_path, "OUT/temp.wav"), 22050,
                  audios.astype(np.float32))
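
A minimal usage sketch for the class above, assuming Text2SpeechModel and the referenced model files are already configured.

tts = Text2Speech()
tts.synthesize_text_to_speech("Hello, how can I help you today?")
# Each back-end writes the result to an OUT/temp.wav file rather than returning audio.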