def change_mode(character: str = "Human_Man", tone: str = "neutral"):
    # Default the tone before it is used to build the reference file path.
    if tone is None:
        tone = "neutral"

    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')

    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
    if local_infpath != in_fpath and character is not None:
        print(
            f'Reference sound has changed; now loading {character}:{tone}...')
        with nostdout():
            in_fpath = local_infpath

            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)

            embed = encoder.embed_utterance(preprocessed_wav)
            torch.manual_seed(seed)
            vocoder.load_model(vocoder_path)
            text_to_speech('Tea.', play_sound=False)
    else:
        print('Mode is already correct. No need to change.')
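For reference, change_mode above depends on a module-level data_path and voices_dict that are not shown in this snippet. A minimal sketch of the structure it assumes (hypothetical values, chosen to stay consistent with the default in_fpath that appears further down in this collection):

# Hypothetical shape of the lookup table used by change_mode; only the keys it
# actually reads ('ID' and 'tone') are shown.
data_path = '/path/to/LibriSpeech/train-clean-100'
voices_dict = {
    'Human_Man': {
        'ID': 'F1088-Christabel',                   # speaker training directory
        'tone': {'neutral': '1088-134315-0002'},    # tone name -> reference clip stem
    },
}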
Example #2
 def load_models(self):
     if not torch.cuda.is_available():
             print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                   "for deep learning, ensure that the drivers are properly installed, and that your "
                   "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                   "not supported.", file=sys.stderr)
             quit(-1)
     device_id = torch.cuda.current_device()
     gpu_properties = torch.cuda.get_device_properties(device_id)
     print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
           "%.1fGb total memory.\n" % 
           (torch.cuda.device_count(),
            device_id,
            gpu_properties.name,
            gpu_properties.major,
            gpu_properties.minor,
            gpu_properties.total_memory / 1e9))
 
 
     ## Load the models one by one.
     print("Preparing the encoder, the synthesizer and the vocoder...")
     encoder.load_model(self.enc_model_fpath)
     print("Loaded Encoder")
     self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
     print("Loaded Synth")
     vocoder.load_model(self.voc_model_fpath)
     print("Loaded Vocoder")
Example #3
def load_model(in_fpath, parser):

	parser.add_argument("-e", "--enc_model_fpath", type=Path, 
		        default="encoder/saved_models/pretrained.pt",
		        help="Path to a saved encoder")
	parser.add_argument("-s", "--syn_model_dir", type=Path, 
		        default="synthesizer/saved_models/logs-pretrained/",
		        help="Directory containing the synthesizer model")
	parser.add_argument("-v", "--voc_model_fpath", type=Path, 
		        default="vocoder/saved_models/pretrained/pretrained.pt",
		        help="Path to a saved vocoder")
	parser.add_argument("--low_mem", action="store_true", help=\
	"If True, the memory used by the synthesizer will be freed after each use. Adds large "
	"overhead but allows to save some GPU memory for lower-end GPUs.")
	parser.add_argument("--no_sound", action="store_true", help=\
	"If True, audio won't be played.")
	args = parser.parse_args()
	encoder.load_model(args.enc_model_fpath)
	synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
	vocoder.load_model(args.voc_model_fpath)

	preprocessed_wav = encoder.preprocess_wav(in_fpath)
	original_wav, sampling_rate = librosa.load(in_fpath)
	preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
	embed = encoder.embed_utterance(preprocessed_wav)
	
	return synthesizer, sampling_rate, embed
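A hedged usage sketch for load_model above, following the same synthesize/vocode/pad pattern used throughout the other examples (the reference clip and output filename are placeholders, and soundfile is assumed to be available for writing the result):

# Sketch only: call load_model, synthesize one sentence, and write the waveform.
import argparse
import numpy as np
import soundfile as sf                     # assumption: soundfile is installed
from vocoder import inference as vocoder

synthesizer, sampling_rate, embed = load_model('reference.wav', argparse.ArgumentParser())
specs = synthesizer.synthesize_spectrograms(['This is a cloned voice.'], [embed])
generated_wav = vocoder.infer_waveform(specs[0])
# Pad by one second of silence, as the other examples do, to work around the
# sounddevice truncation issue they mention.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode='constant')
sf.write('cloned.wav', generated_wav.astype(np.float32), synthesizer.sample_rate)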
Example #4
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
Example #5
def setup():
    global synthesizer
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
Example #6
def load_models():
    #encoder_weights = Path(encoder_path)
    vocoder_weights = Path(vocoder_path)
    syn_dir = Path(synthesizer_path)
    #encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)

    return encoder, synthesizer, vocoder
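A short usage sketch for load_models above. Since the encoder weights are commented out in this variant, the speaker embedding has to come from somewhere else, for example a precomputed .npy file as in the next example (the path below is hypothetical):

# Sketch only: reuse a saved embedding with the freshly loaded synthesizer/vocoder.
import numpy as np

encoder, synthesizer, vocoder = load_models()
embed = np.load('user_data/embeds/russell.npy')          # precomputed speaker embedding
specs = synthesizer.synthesize_spectrograms(['A short test sentence.'], [embed])
generated_wav = vocoder.infer_waveform(specs[0])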
Example #7
async def generate_wav(text, filename):
    user_id = "russell"
    embed_path = "user_data/embeds/{}.npy".format(user_id)
    embed_path = Path(embed_path)

    if embed_path.is_file():
        embed = np.load(embed_path)
        print("load embedding in {}".format(embed_path))
    else:
        raise ("user embedding not found")

    # ================== synthesizer ==================
    start_time = time.time()

    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    print("--- synthesizer: %s seconds ---" % (time.time() - start_time))

    # ================== vocoder ==================
    start_time = time.time()

    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)
    print("")
    print("--- vocoder: %s seconds ---" % (time.time() - start_time))

    # ================== post generation ==================
    start_time = time.time()

    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    print("--- post generation: %s seconds ---" % (time.time() - start_time))

    sf.write("./user_data/generated_voice/%s/"%(user_id) + "%s.wav"%filename, \
            generated_wav.astype(np.float32), synthesizer.sample_rate)
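Because generate_wav is declared async, it has to be awaited or driven by an event loop rather than called directly; a minimal invocation might look like this (text and filename are placeholders):

import asyncio

# Runs the coroutine to completion and writes ./user_data/generated_voice/russell/demo.wav
asyncio.run(generate_wav('Hello, this is a cloned voice.', 'demo'))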
Example #8
    def init_vocoder(self):
        model_fpath = self.ui.current_vocoder_fpath
        # Case of Griffin-lim
        if model_fpath is None:
            return

        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        vocoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
Example #9
    def __init__(self):
        # Info & args
        enc_model_fpath = Path("encoder/saved_models/pretrained.pt")

        syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
        voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
        low_mem = False

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
        vocoder.load_model(voc_model_fpath)
Example #10
def DeepTalk_vocoder(synthesized_mel, breaks, model_save_path, normalize=True):
    vocoder.load_model(model_save_path)
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel,
                                  progress_callback=no_action,
                                  normalize=normalize)

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end, in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    wav1 = wav1 / np.abs(wav1).max() * 0.97
    return wav1
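The breaks argument of DeepTalk_vocoder is the per-spectrogram frame count, which lets the function re-insert short silences between sentences. A hedged sketch of how it is typically produced, assuming synthesizer, texts, embeds and model_save_path are set up as in Example #26 below:

# Sketch only: derive `breaks` from the individual spectrograms before concatenating them.
import numpy as np

specs = synthesizer.synthesize_spectrograms(texts, embeds)
breaks = [spec.shape[1] for spec in specs]          # frames per synthesized segment
synthesized_mel = np.concatenate(specs, axis=1)
wav = DeepTalk_vocoder(synthesized_mel, breaks, model_save_path)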
Example #11
    def initialize(self):
        print("Running a test of your configuration...\n")
        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.")
            quit(-1)
        print("PyTorch is available and working...")
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
        ## Load the models one by one.

        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)

        vocoder.load_model(self.voc_model_fpath)

        ## Run a test
        print("Testing your configuration with small inputs.")
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        embed = np.random.rand(speaker_embedding_size)
        embed /= np.linalg.norm(embed)
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        mel = np.concatenate(mels, axis=1)
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)
        print("All test passed! You can now synthesize speech.\n\n")
Example #12
 def init_vocoder(self):
     model_fpath = self.ui.current_vocoder_fpath
     # Case of Griffin-lim
     if model_fpath is None:
         return
     else:
         self.ui.log("Loading the vocoder %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
         if Path(model_fpath).parent.stem == "melgan":
             vocoder_melgan.load_vocoder_melgan(model_fpath)
         elif Path(model_fpath).parent.stem == "wavernn":
             vocoder.load_model(model_fpath)
         else:
             return
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)
Example #13
    def clone_voice(self, embed):
        synthesizer = Synthesizer("synthesizer/saved_models/logs-pretrained/taco_pretrained")
        vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")
        with open(self.json_text) as text_json:
            data = json.load(text_json)
            for x in data:
                text = x['translation']
                # The synthesizer works in batch, so you need to put your data in a list or numpy array
                texts = [text]
                embeds = [embed]
                # If you know what the attention layer alignments are, you can retrieve them here by
                # passing return_alignments=True
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = specs[0]

                ## Generating the waveform
                print("\nSynthesizing the waveform:")
                # Synthesizing the waveform is fairly straightforward. Remember that the longer the
                # spectrogram, the more time-efficient the vocoder.
                generated_wav = vocoder.infer_waveform(spec)

                ## Post-generation
                # There's a bug with sounddevice that makes the audio cut one second earlier, so we
                # pad it.
                generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

                # Save it on the disk
                output_dir = '../temp'
                try:
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                except:
                    pass
                fpath = "%s/%d.wav" % (output_dir, x['index'])
                generated_wav *= 32767 / max(0.01, np.max(np.abs(generated_wav)))
                wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.int16))
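clone_voice above expects self.json_text to name a JSON file whose entries carry 'translation' (the text to synthesize) and 'index' (used for the output filename). A hypothetical example of that structure, shown as Python data for clarity:

# Hypothetical contents of the JSON file referenced by self.json_text.
example_entries = [
    {'index': 0, 'translation': 'Hello, this is the first translated sentence.'},
    {'index': 1, 'translation': 'And this is the second one.'},
]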
Example #14
            (torch.cuda.device_count(),
            device_id,
            gpu_properties.name,
            gpu_properties.major,
            gpu_properties.minor,
            gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")
    
    ## Remind the user to download pretrained models if needed
    
    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
    synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
    vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

    
    try:
        # Get the reference audio filepath
        message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                    "wav, m4a, flac, ...):\n"
        in_fpath = Path("samples/elon_voice.wav") #hardcoded for now


        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is 
        # important: there is preprocessing that must be applied.
        
        # The following two methods are equivalent:
        # - Directly load from the filepath:
Example #15
    embeddings = json.load(f)

# celebrities = ['Kevin Hart','Morgan Freeman','Tom Cruise']
celebrities = embeddings.keys()

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

encoder_weights = Path("encoder/saved_models/pretrained.pt")
vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
encoder.load_model(encoder_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)

outfile = "/content/drive/My Drive/Real-Time-Voice-Cloning/samples/morgan-freeman-to-me-it's-just-a-made-up-word-a-politician's-word-so-that-young-fellas-like-yourself-can-wear-a-suit-and-a-tie-and-have-a-job.wav"
in_fpath = Path(outfile)
print("preprocessing the training audio file")
# reprocessed_wav = encoder.preprocess_wav(in_fpath)
original_wav, sampling_rate = librosa.load(in_fpath)
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
embed = encoder.embed_utterance(preprocessed_wav)


def write(f, sr, x, normalized=False):
    """numpy array to MP3"""
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # normalized array - each item should be a float in [-1, 1)
        y = np.int16(x * 2**15)
Example #16
enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")

# Load the models one by one


## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
# encoder.load_model(args.enc_model_fpath)
# synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
# vocoder.load_model(args.voc_model_fpath)

encoder.load_model(enc_model_fpath)
synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"))
vocoder.load_model(voc_model_fpath)

# Run a test

print("Testing your configuration with small inputs.")
# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
# sampling rate, which may differ.
# If you're unfamiliar with digital audio, know that it is encoded as an array of floats
# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
# The sampling rate is the number of values (samples) recorded per second, it is set to
# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
# to an audio of 1 second.
print("\tTesting the encoder...")
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
Example #17
def maux(output_text, num):

    print("debug -- django")

    ## Info & args
    # parser = argparse.ArgumentParser(
    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter
    # )
    # parser.add_argument("-e", "--enc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt",
    #                     help="Path to a saved encoder")
    # parser.add_argument("-s", "--syn_model_dir", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/",
    #                     help="Directory containing the synthesizer model")
    # parser.add_argument("-v", "--voc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt",
    #                     help="Path to a saved vocoder")
    # parser.add_argument("--low_mem", action="store_true", help=\
    #     "If True, the memory used by the synthesizer will be freed after each use. Adds large "
    #     "overhead but allows to save some GPU memory for lower-end GPUs.")
    # parser.add_argument("--no_sound", action="store_true", help=\
    #     "If True, audio won't be played.")
    # args = parser.parse_args()
    # print_args(args, parser)
    # if not args.no_sound:
    #     import sounddevice as sd

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    #encoder.load_model(args.enc_model_fpath)
    #synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    #vocoder.load_model(args.voc_model_fpath)
    encoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt"
    )

    synthesizer = Synthesizer(
        "D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/taco_pretrained",
        low_mem=False)

    vocoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt"
    )

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel,
                           target=200,
                           overlap=50,
                           progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print(
        "This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
        "show how you can interface this project easily with your own. See the source code for "
        "an explanation of what is happening.\n")

    print("Interactive generation loop")

    in_fpath = Path(
        "D:/RemindMe/django-remindme/mysite/trained model/sam_narration2.wav")
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file succesfully")
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")
    embeds = [embed]

    text = output_text
    texts = [text]

    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Play the audio (non-blocking)

    # Save it on the disk
    filexpath = "D:/RemindMe/django_remindme_model/mysite/media/demo_output_%02d.wav" % num
    fx = "demo_output_%02d" % num
    print(generated_wav.dtype)
    librosa.output.write_wav(filexpath, generated_wav.astype(np.float32),
                             synthesizer.sample_rate)

    print("\nSaved output as %s\n\n" % filexpath)

    return fx
Example #18
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}
        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check if audio file has less than 5Mb
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav,
                               (0, synthesizer.sample_rate),
                               mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind before reading back, otherwise fp.read() returns empty bytes
        return {"audio": fp.read()}

    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
Example #19
global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
filenum = 0

data_path = '/Users/glw001/Documents/Development/voice_clone/LibriSpeech/train-clean-100'
in_fpath = Path(f'{data_path}/F1088-Christabel/134315/1088-134315-0002.flac')

seed = 694201312

word_substitutions = {'do': 'doo', 'Do': 'Doo', 'NPC': 'En Pee See'}

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(encoder_path)
synthesizer = Synthesizer(synthesizer_path.joinpath("taco_pretrained"),
                          seed=seed)
vocoder.load_model(vocoder_path)

preprocessed_wav = encoder.preprocess_wav(in_fpath)
original_wav, sampling_rate = librosa.load(str(in_fpath))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

embed = encoder.embed_utterance(preprocessed_wav)
torch.manual_seed(seed)
vocoder.load_model(vocoder_path)


def word_replace(text: str):
    text = " " + text + " "
    for word in word_substitutions:
        regex = rf'\s({word})[.\s!?]'
        word_match = re.findall(regex, text)
Example #20
def run_voice_cloning():
    ## Model locations
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
    ref_voice_path = request.json["voiceFile"]  # filename like ojo3.wav
    messages = request.json["messages"]  # array of strings
    low_mem = request.json[
        "low_mem"] if "low_mem" in request.json else False  # whether to use LowMem Mode

    # Base64 encode the parameters so that we can reference this job in later api calls
    dataToEncodeAsID = ','.join(messages) + ref_voice_path
    encodedBytes = base64.b64encode(dataToEncodeAsID.encode("utf-8"))
    req_id = str(encodedBytes, "utf-8")
    # Md5 Hash it so that it is a consistent length
    req_id = hashlib.md5(req_id.encode('utf-8')).hexdigest()

    # Clear destination folder of generated sound files
    output_path = "/output/%s/" % req_id
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        return abort(500)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    in_fpath = Path(ref_voice_path)

    print("Computing the embedding")
    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file succesfully")

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")

    print("Generation loop")
    num_generated = 0
    fpath = None
    for text in messages:
        try:
            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            # Save it on the disk
            fpath = output_path + ("output_%03d.wav" % num_generated)
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")

    return req_id
Example #21
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    encoder.load_model(Path('encoder/saved_models/pretrained.pt'))
    synthesizer = Synthesizer(
        Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
    vocoder.load_model(Path('vocoder/saved_models/pretrained/pretrained.pt'))

    voice = 'voices/peabody/voice.wav'

    try:
        preprocessed_wav = encoder.preprocess_wav(voice)
        embed = encoder.embed_utterance(preprocessed_wav)

        text = "Hello Carina. Hello Carina Hello Carina Hello Carina Hello Carina Hello Carina Hello Carina This is Kevin Smith, happy new translation around the Sun."

        texts = [text]
        embeds = [embed]

        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]
Example #22
def hello_world():
    legoutput = upload_file()
    lig = "This is a demo utterance. This will work when you do not add any utterance."
    if request.method == 'POST':
        lig = request.form["textarea"]
    print(str(lig))
    #return mainpage()
    if str(legoutput) == "None":
        return render_template("index.html", output="")
    else:
        from encoder.params_model import model_embedding_size as speaker_embedding_size
        from utils.argutils import print_args
        from synthesizer.inference import Synthesizer
        from encoder import inference as encoder
        from vocoder import inference as vocoder
        from pathlib import Path
        import numpy as np
        import librosa
        import argparse
        import torch
        try:
            parser = argparse.ArgumentParser(
                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
            parser.add_argument("-e",
                                "--enc_model_fpath",
                                type=Path,
                                default="encoder/saved_models/pretrained.pt")
            parser.add_argument(
                "-s",
                "--syn_model_dir",
                type=Path,
                default="synthesizer/saved_models/logs-pretrained/")
            parser.add_argument(
                "-v",
                "--voc_model_fpath",
                type=Path,
                default="vocoder/saved_models/pretrained/pretrained.pt")
            parser.add_argument("--low_mem", action="store_true")
            #parser.add_argument("--no_sound", action="store_true")
            args = parser.parse_args()
            print_args(args, parser)
            #if not args.no_sound:
            #    import sounddevice as sd
            encoder.load_model(args.enc_model_fpath)
            synthesizer = Synthesizer(
                args.syn_model_dir.joinpath("taco_pretrained"),
                low_mem=args.low_mem)
            vocoder.load_model(args.voc_model_fpath)
            num_generated = 0
            in_fpath = legoutput[1]
            print(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")
            text = str(lig)
            texts = [text]
            embeds = [embed]
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")
            #if not args.no_sound:
            #    sd.stop()
            #    sd.play(generated_wav, synthesizer.sample_rate)
            fpath = "static/output.wav"
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % fpath)
            return render_template("index.html",
                                   output=htmloader(text, legoutput[1], fpath))
        except Exception as e:
            return render_template("index.html",
                                   output="Caught exception: %s" % repr(e))
Example #23
import warnings


app = flask.Flask(__name__)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Ok to hard code these locations
encoder_model_path = '/opt/ml/model/encoder/saved_models/pretrained.pt'
synthesizer_path = '/opt/ml/model/synthesizer/saved_models/logs-pretrained/taco_pretrained/'
vocoder_model_path = '/opt/ml/model/vocoder/saved_models/pretrained/pretrained.pt'

# Load the models
encoder.load_model(Path(encoder_model_path))
synthesizer = Synthesizer(Path(synthesizer_path))
vocoder.load_model(Path(vocoder_model_path))


def clone_voice(sentence, results_file):
    """Adapted from 'demo_cli.py'"""
    u_path = Path('utterance.wav')
    results_path = Path(results_file)
    
    preprocessed_wav = encoder.preprocess_wav(u_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([sentence], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    
    librosa.output.write_wav(results_path, generated_wav.astype(np.float32), 
                             synthesizer.sample_rate)
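Note that librosa.output.write_wav was removed in librosa 0.8.0, so clone_voice above (and several other examples in this collection) only runs against older librosa releases. With a newer librosa, the soundfile package offers a drop-in replacement for the final call:

import soundfile as sf

# Equivalent to librosa.output.write_wav on librosa >= 0.8
sf.write(results_path, generated_wav.astype(np.float32), synthesizer.sample_rate)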
Example #24
 def test_config(self):
     ## Print some environment information (for debugging purposes)
     print("Running a test of your configuration...\n")
     try:
         if not torch.cuda.is_available():
             print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                   "for deep learning, ensure that the drivers are properly installed, and that your "
                   "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                   "not supported.", file=sys.stderr)
             quit(-1)
         device_id = torch.cuda.current_device()
         gpu_properties = torch.cuda.get_device_properties(device_id)
         print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
               "%.1fGb total memory.\n" % 
               (torch.cuda.device_count(),
                device_id,
                gpu_properties.name,
                gpu_properties.major,
                gpu_properties.minor,
                gpu_properties.total_memory / 1e9))
     
     
         ## Load the models one by one.
         print("Preparing the encoder, the synthesizer and the vocoder...")
         encoder.load_model(self.enc_model_fpath)
         print("Loaded Encoder")
         self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
         print("Loaded Synth")
         vocoder.load_model(self.voc_model_fpath)
         print("Loaded Vocoder")
         
         ## Run a test
         print("Testing your configuration with small inputs.")
         # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
         # sampling rate, which may differ.
         # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 
         # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
         # The sampling rate is the number of values (samples) recorded per second, it is set to
         # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond 
         # to an audio of 1 second.
         print("\tTesting the encoder...")
         encoder.embed_utterance(np.zeros(encoder.sampling_rate))
         
         # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
         # returns, but here we're going to make one ourselves just for the sake of showing that it's
         # possible.
         embed = np.random.rand(speaker_embedding_size)
         # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 
         # embeddings it will be).
         embed /= np.linalg.norm(embed)
         # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 
         # illustrate that
         embeds = [embed, np.zeros(speaker_embedding_size)]
         texts = ["test 1", "test 2"]
         print("\tTesting the synthesizer... (loading the model will output a lot of text)")
         mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
         
         # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
         # can concatenate the mel spectrograms to a single one.
         mel = np.concatenate(mels, axis=1)
         # The vocoder can take a callback function to display the generation. More on that later. For 
         # now we'll simply hide it like this:
         no_action = lambda *args: None
         print("\tTesting the vocoder...")
         # For the sake of making this test short, we'll pass a short target length. The target length 
         # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
         # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
         # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
         # that has a detrimental effect on the quality of the audio. The default parameters are 
         # recommended in general.
         vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

         print("\tAll tests passed!")

         return("All tests passed!")
         
     except Exception as e:
         return("Caught exception: %s" % repr(e))
Example #25
 gpu_properties = torch.cuda.get_device_properties(device_id)
 print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
       "%.1fGb total memory.\n" % 
       (torch.cuda.device_count(),
        device_id,
        gpu_properties.name,
        gpu_properties.major,
        gpu_properties.minor,
        gpu_properties.total_memory / 1e9))
 
 
 ## Load the models one by one.
 print("Preparing the encoder, the synthesizer and the vocoder...")
 encoder.load_model(args.enc_model_fpath)
 synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
 vocoder.load_model(args.voc_model_fpath)
 
 
 ## Run a test
 print("Testing your configuration with small inputs.")
 # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
 # sampling rate, which may differ.
 # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 
 # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
 # The sampling rate is the number of values (samples) recorded per second, it is set to
 # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond 
 # to an audio of 1 second.
 print("\tTesting the encoder...")
 encoder.embed_utterance(np.zeros(encoder.sampling_rate))
 
 # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
Example #26
def run_DeepTalk_demo(ref_audio_path='samples/ref_VCTKp240.wav',
                      output_text='Hello World',
                      enc_model_fpath=config.enc_model_fpath,
                      enc_module_name=config.enc_module_name,
                      syn_model_dir=config.syn_model_dir,
                      voc_model_fpath=config.voc_model_fpath,
                      key_embed=None):
    class hyperparameter:
        def __init__(self):

            self.enc_model_fpath = enc_model_fpath
            self.enc_module_name = enc_module_name
            self.syn_model_dir = syn_model_dir
            self.voc_model_fpath = voc_model_fpath

            self.enc_normalize = False
            self.voc_normalize = True
            self.low_mem = False  # "If True, the memory used by the synthesizer will be freed after each use. Adds large "
            # "overhead but allows to save some GPU memory for lower-end GPUs."
            self.no_sound = False  # If True, audio won't be played.
            self.sampling_rate = 16000  ## 16000: For mel-spectrogram based methods; 8000: For fCNN base methods
            self.ref_audio_path = ref_audio_path
            self.output_text = output_text

    args = hyperparameter()

    ## Load trained models: Encoder, Synthesizer, and Vocoder
    # os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    encoder.load_model(args.enc_model_fpath, module_name=args.enc_module_name)
    synthesizer = Synthesizer(args.syn_model_dir, low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    ## Encoding stage
    print('---------------------------------------------------------------')
    print('Stage 1/3: Encoder')
    print('---------------------------------------------------------------')
    wav = Synthesizer.load_preprocess_wav(args.ref_audio_path)
    ref_audio = encoder.preprocess_wav(wav)

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True,
                                                       key_embed=key_embed)
    if (args.enc_normalize):
        embed = embed / np.linalg.norm(embed)

    if (embed.shape[0] == 128):
        embed = np.concatenate((embed, embed), axis=0)

    ## Synthesizing stage
    print('---------------------------------------------------------------')
    print('Stage 2/3: Synthesizer')
    print('---------------------------------------------------------------')
    texts = args.output_text
    # texts = re.split(',|.',texts)
    texts = re.split(r'[,.]\s*', texts)
    texts[:] = [x for x in texts if x]
    print(texts)
    # texts = texts.split("\n")
    # texts = texts.split(".")
    # texts = texts.split(",")
    embeds = np.stack([embed] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    synthesized_mel = np.concatenate(specs, axis=1)

    ## Vocoding stage
    print('---------------------------------------------------------------')
    print('Stage 3/3: Vocoder')
    print('---------------------------------------------------------------')
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel,
                                  progress_callback=no_action,
                                  normalize=args.voc_normalize)
    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end, in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    synthesized_wav = wav1 / np.abs(wav1).max() * 0.97

    return synthesized_wav, Synthesizer.sample_rate, embed
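A brief call sketch for run_DeepTalk_demo above, writing the returned waveform to disk (using soundfile for the writer is an assumption; any WAV writer would do):

import soundfile as sf

wav, sample_rate, embed = run_DeepTalk_demo(ref_audio_path='samples/ref_VCTKp240.wav',
                                            output_text='Hello World')
sf.write('deeptalk_output.wav', wav, sample_rate)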
Example #27
def voice_cloning(audio_file, text, enc_model_fpath, syn_model_dir,
                  voc_model_fpath, low_mem):
    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print(
            "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
            "for deep learning, ensure that the drivers are properly installed, and that your "
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print(
        "\tTesting the synthesizer... (loading the model will output a lot of text)"
    )
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel,
                           target=200,
                           overlap=50,
                           progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print(
        "This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
        "show how you can interface this project easily with your own. See the source code for "
        "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            # message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
            #           "wav, m4a, flac, ...):\n"
            # in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
            in_fpath = Path(audio_file.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file succesfully")

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # text = input("Write a sentence (+-20 words) to be synthesized:\n")
            print('\n\nThe text to convert to speech: ', text)
            text = text

            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            # # Play the audio (non-blocking)
            # if not args.no_sound:
            #     sd.stop()
            #     sd.play(generated_wav, synthesizer.sample_rate)

            # Save it on the disk
            fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                     synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #28
import numpy as np
from pathlib import Path
from scipy.io import wavfile
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder


SAMPLE_RATE = 22050
embedding = None


# loading Models
encoder.load_model(BASE_PATH_VOICE_CLONE / Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(BASE_PATH_VOICE_CLONE / Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(BASE_PATH_VOICE_CLONE / Path("vocoder/saved_models/pretrained/pretrained.pt"))
print("All models Load Sucessfully")


# In[4]:


import librosa

def _compute_embedding(audio):
    '''
    Description 
        Loading Embedding from the audio file to clone
        
    Input:
        audio: Audio File