Example #1
async def create_upload_file(text: str):
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    # Generating the waveform
    print("Synthesizing the waveform:")

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    # Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)

    # Save it on the disk
    output = BytesIO()
    wavfile.write(output, synthesizer.sample_rate,
                  generated_wav.astype(np.float32))
    output.seek(0)  # rewind the buffer so the response streams from the start
    return StreamingResponse(output, media_type="audio/x-wav")
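
Example #1 relies on module-level objects (encoder, synthesizer, vocoder, np, wavfile, StreamingResponse) and a precomputed speaker embedding embed that are created elsewhere in the source project. A minimal setup sketch, assuming the pretrained SV2TTS model paths that also appear in the later examples and a hypothetical reference recording reference.wav:

# Setup sketch (assumed model paths and reference file; adjust to your checkout)
from io import BytesIO
from pathlib import Path

import numpy as np
from scipy.io import wavfile
from fastapi.responses import StreamingResponse

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Load the three pretrained models once at startup
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# Compute the speaker embedding from a reference recording (hypothetical file)
reference_wav = encoder.preprocess_wav(Path("reference.wav"))
embed = encoder.embed_utterance(reference_wav)
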
Example #2
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #3
def text_to_speech(text: str, play_sound: bool = True):
    start = timer()
    texts = [text]
    embeds = [embed]
    print("Creating the MEL spectrogram")
    with nostdout():
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]

    print("Generating audio")

    with nostdout():
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate))

        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
        generated_wav = encoder.preprocess_wav(generated_wav)

    filename = f'tts_generated{filenum}.wav'
    sf.write(filename, generated_wav.astype(np.float32),
             synthesizer.sample_rate)

    print(text)
    if play_sound:
        os.system(f'afplay {filename} &')
    elapsed_time = timer() - start
    print(f'Generated in {elapsed_time:.2f} seconds')
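
The nostdout() context manager and the filenum counter used above come from the surrounding project and are not shown here. A minimal sketch of a stdout-suppressing helper with the same shape, using only the standard library:

import contextlib
import io
import sys

@contextlib.contextmanager
def nostdout():
    # Temporarily swallow stdout so the synthesizer/vocoder progress output stays quiet
    saved_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        yield
    finally:
        sys.stdout = saved_stdout
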
Example #4
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        self.ui.save_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        if speaker_name is not None:
            name = speaker_name
        else:
            name = "unknown"
        name = name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #5
    def generate_voice(self, in_fpath, text, out_fpath):
        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file successfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            ## Generating the waveform
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)

            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")
            librosa.output.write_wav(out_fpath,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % out_fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #6
def synth(text, audio_file):
    """

    Parameters
    ----------
    text : string
        text to be said in synthesized voice
    audio_file : filepath
        filepath for audio file in wav format

    Returns
    -------
    generated_wav : numpy.ndarray
        Numpy padded array of synthesized audio signal

    """

    in_fpath = Path(audio_file)  # use the reference recording passed in
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Synthesizing new audio...")
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    return generated_wav
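
synth() returns the padded waveform without writing it anywhere; a caller could save or play it the same way the other examples do. A hypothetical usage sketch (output file name is a placeholder):

import soundfile as sf

wav = synth("This is a cloned voice.", "audio.wav")
sf.write("synth_output.wav", wav.astype(np.float32), synthesizer.sample_rate)
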
Example #7
    def initialize(self):
        print("Running a test of your configuration...\n")
        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.")
            quit(-1)
        print("PyTorch is available and working...")
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
        ## Load the models one by one.

        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)

        vocoder.load_model(self.voc_model_fpath)

        ## Run a test
        print("Testing your configuration with small inputs.")
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        embed = np.random.rand(speaker_embedding_size)
        embed /= np.linalg.norm(embed)
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        mel = np.concatenate(mels, axis=1)
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)
        print("All test passed! You can now synthesize speech.\n\n")
Example #8
def synthesize(embed, text):
    print("Synthesizing new audio...")
    #with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    print(type(generated_wav))
    return generated_wav
Example #9
def synthesize(embed, text):
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    print(synthesizer.sample_rate)
    sf.write('infer5.wav', generated_wav, synthesizer.sample_rate, 'PCM_24')
Example #10
def synthesize_and_save(inpath, outpath):
    spec = np.load(inpath).T
    generated_wav = vocoder.infer_waveform(spec)
    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, 16000), mode="constant")
    # Save it on the disk
    librosa.output.write_wav(outpath, generated_wav.astype(np.float32), 16000)
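
synthesize_and_save() expects a mel spectrogram saved to disk as a .npy file; the transpose on load suggests the file stores frames as rows. A hypothetical round trip with a spectrogram produced by the synthesizer (file names and text are placeholders):

spec = synthesizer.synthesize_spectrograms(["hello world"], [embed])[0]
np.save("demo_spec.npy", spec.T)  # saved as (frames, n_mels), hence the .T when loading
synthesize_and_save("demo_spec.npy", "demo_output.wav")
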
Example #11
async def generate_wav(text, filename):
    user_id = "russell"
    embed_path = "user_data/embeds/{}.npy".format(user_id)
    embed_path = Path(embed_path)

    if embed_path.is_file():
        embed = np.load(embed_path)
        print("load embedding in {}".format(embed_path))
    else:
        raise ("user embedding not found")

    # ================== synthesizer ==================
    start_time = time.time()

    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    print("--- synthesizer: %s seconds ---" % (time.time() - start_time))

    # ================== vocoder ==================
    start_time = time.time()

    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)
    print("")
    print("--- vocoder: %s seconds ---" % (time.time() - start_time))

    # ================== post generation ==================
    start_time = time.time()

    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    print("--- post generation: %s seconds ---" % (time.time() - start_time))

    sf.write("./user_data/generated_voice/%s/"%(user_id) + "%s.wav"%filename, \
            generated_wav.astype(np.float32), synthesizer.sample_rate)
Example #12
def clone_voice(sentence, results_file):
    """Adapted from 'demo_cli.py'"""
    u_path = Path('utterance.wav')
    results_path = Path(results_file)
    
    preprocessed_wav = encoder.preprocess_wav(u_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([sentence], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    
    librosa.output.write_wav(results_path, generated_wav.astype(np.float32), 
                             synthesizer.sample_rate)
Example #13
def text_callback(bot, update):

    spectrogram = synthesizer.synthesize_spectrograms([update.message.text],
                                                      [embedding])
    wav = vocoder.infer_waveform(spectrogram[0])
    wav = np.pad(wav, (0, synthesizer.sample_rate), mode="constant")

    librosa.output.write_wav(f"{output_path}{update.effective_chat.id}.wav",
                             wav,
                             sr=synthesizer.sample_rate)
    bot.send_voice(chat_id=update.effective_chat.id,
                   voice=open(f"{output_path}{update.effective_chat.id}.wav",
                              "rb"),
                   timeout=100)
Example #14
def generate_wav(text, num_generated, synthesizer, sampling_rate, embed, debug = False):
	texts = [text]
	embeds = [embed]
	specs = synthesizer.synthesize_spectrograms(texts, embeds)
	spec = specs[0]
	generated_wav = vocoder.infer_waveform(spec, True, False)
	generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
	print("zeros=", np.count_nonzero(generated_wav==0), "\n")
	fpath = "output_%02d.wav" % num_generated
	librosa.output.write_wav(fpath, generated_wav.astype(np.float32), 
			     synthesizer.sample_rate)
	if debug:
		sd.stop()
		sd.play(generated_wav, synthesizer.sample_rate)
Example #15
def audioFromEmbeds(filename, embed):
    textPath = dataPath + "/" + filename + ".txt"
    # read the text file and prepare the text string for the synthesizer
    with open(textPath) as textFile:
        text = textFile.read().replace("\n", " ")

    #synthesize the text together with the embeds
    specs = synthesizer.synthesize_spectrograms([text], [embed])

    #generate the audio using the vocoder
    generated_wav = vocoder.infer_waveform(specs[0])

    return np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
Example #16
    def gen_audio(self, ref_audio, text):
        try:
            in_fpath = Path(ref_audio.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            # preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath, sr=None)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file succesfully, rate=%s" % sampling_rate)

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            # generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")
            print("\n samples = %s @ %s" % (len(generated_wav), self.synthesizer.sample_rate))

            return generated_wav

        except Exception as e:
            traceback.print_exc()
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #17
def DeepTalk_vocoder(synthesized_mel, breaks, model_save_path, normalize=True):
    vocoder.load_model(model_save_path)
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel,
                                  progress_callback=no_action,
                                  normalize=normalize)

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    wav1 = wav1 / np.abs(wav1).max() * 0.97
    return wav1
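
Examples #2, #4, #17 and #22 splice short silences back in between sentences using a breaks list together with Synthesizer.hparams.hop_size: each entry records how many mel frames one sentence contributed, so the concatenated spectrogram can be cut apart again at sample boundaries. A sketch of that bookkeeping under that assumption (input text, embedding and vocoder path are placeholders):

texts = text.split("\n")  # one sentence per line
embeds = [embed] * len(texts)
specs = synthesizer.synthesize_spectrograms(texts, embeds)
breaks = [s.shape[1] for s in specs]  # mel frames contributed by each sentence
spec = np.concatenate(specs, axis=1)  # one long spectrogram for the vocoder
wav = DeepTalk_vocoder(spec, breaks, "vocoder/saved_models/pretrained/pretrained.pt")
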
Example #18
def tts(input_dict):
    '''
    Flow:
    0) Check whether the audio already has an embedding (not implemented yet)
    1) Encode the audio
    2) Synthesize the text with the embedding
    3) Generate the output wav with the vocoder
    '''
    # init
    output_dict = {"data": {}}

    # loop the input
    for audio_name, raw_audio in input_dict["data"].items():
        wav_name_no_ext = Path(audio_name).stem
        saved_path_obj = Path.cwd() / "data/output"

        # step 1
        print("Step 1")
        raw_audio_np, sample_rate = librosa.load(io.BytesIO(raw_audio))
        preprocessed_wav = encoder.preprocess_wav(raw_audio_np, sample_rate)
        embeddings = encoder.embed_utterance(preprocessed_wav)

        # step 2
        print("Step 2")
        splitted_text = input_dict["text"].split(".")
        clean_text_list = [text for text in splitted_text if len(text) > 0]
        if len(clean_text_list) == 0:
            raise Exception("Empty text field")
        sentence_count = len(clean_text_list)
        embeddings_list = [embeddings] * sentence_count
        specs = synthesizer.synthesize_spectrograms(clean_text_list,
                                                    embeddings_list)

        # step 3
        print("Step 3")
        for index, spec in enumerate(specs):
            generated_wav = vocoder.infer_waveform(spec)
            # pad with one second of silence so playback is not cut short
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            file_name = "{}_tts_{}.wav".format(wav_name_no_ext, index)
            file_path = saved_path_obj / file_name
            sf.write(str(file_path), generated_wav.astype(np.float32),
                     synthesizer.sample_rate, 'PCM_16')
            output_dict["data"][index] = file_path

    return output_dict
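
tts() expects a dict holding raw audio bytes keyed by file name plus the text to synthesize, and returns the paths of the generated clips. A hypothetical call (the reference file is a placeholder):

with open("reference.wav", "rb") as f:
    result = tts({"data": {"reference.wav": f.read()}, "text": "First sentence. Second sentence."})
print(result["data"])  # maps sentence index to the saved wav path, e.g. .../data/output/reference_tts_0.wav
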
Example #19
def generate():
    text_to_be_analyzed = request.get_json()["text"]
    session_client = dialogflow.SessionsClient()
    session = session_client.session_path(DIALOGFLOW_PROJECT_ID, SESSION_ID)
    text_input = dialogflow.types.TextInput(
        text=text_to_be_analyzed, language_code=DIALOGFLOW_LANGUAGE_CODE)
    query_input = dialogflow.types.QueryInput(text=text_input)
    try:
        response = session_client.detect_intent(session=session,
                                                query_input=query_input)
    except InvalidArgument:
        raise
    text = response.query_result.fulfillment_text
    in_fpath = Path("audio.wav")
    reprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)
    if os.path.exists("temp.wav"):
        os.remove("temp.wav")
    else:
        print("The file does not exist")
    if os.path.exists("temp.mp3"):
        os.remove("temp.mp3")
    sf.write("temp.wav", generated_wav, synthesizer.sample_rate)
    AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
    encoded_gen_wav_string = "data:audio/mp3;base64,"
    with open("temp.mp3", "rb") as f1:
        encoded_f1 = base64.b64encode(f1.read())
        encoded_gen_wav_string += str(encoded_f1, 'ascii', 'ignore')

    # encoded_gen_wav_bytes= base64.b64encode(generated_wav)
    # encoded_gen_wav_string = str(encoded_gen_wav_bytes,'ascii', 'ignore')

    res = {
        "data": encoded_gen_wav_string,
        "rate": synthesizer.sample_rate,
        "text": text
    }
    # sf.write("demo_output.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
    return jsonify(res), 200
Example #20
def synth():
  text = "hey welcome to programming hut" #@param {type:"string"}
  print("Now recording for 10 seconds, say what you will...")
  
  ### record
  record(30)
  print("Audio recording complete")
  in_fpath = Path("audio.wav")
  reprocessed_wav = encoder.preprocess_wav(in_fpath)
  original_wav, sampling_rate = librosa.load(in_fpath)
  preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
  embed = encoder.embed_utterance(preprocessed_wav)
  print("Synthesizing new audio...")
  with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
  generated_wav = vocoder.infer_waveform(specs[0])
  generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
  display(Audio(generated_wav, rate=synthesizer.sample_rate))
Example #21
    def convert(self, text, in_fpath, outfn):
        print(f"converting\ntext:\n {text}\n\n wavfn: {in_fpath}")
        print(f"outfn: {outfn}")

        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file succesfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            # TODO: check whether this padding is necessary
            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")

            #TODO: Save it on the disk?
            #fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            librosa.output.write_wav(outfn, generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            #num_generated += 1
            print("\nSaved output as %s\n\n" % outfn)
            return True
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            return False
Example #22
 def vocode(self, spec, breaks):
     ## Generating the waveform
     print("Synthesizing the waveform:")
     # Synthesizing the waveform is fairly straightforward. Remember that the longer the
     # spectrogram, the more time-efficient the vocoder.
     wav = vocoder.infer_waveform(spec)
     
     # Add breaks
     b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
     b_starts = np.concatenate(([0], b_ends[:-1]))
     wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
     breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
     wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
     
     ## Post-generation
     # There's a bug with sounddevice that makes the audio cut one second earlier, so we
     # pad it.
     wav = np.pad(wav, (0, self.synthesizer.sample_rate), mode="constant")
     return wav
Example #23
def vocalize(n_clicks, celebrity, value):
    text = value
    embed = embeddings[celebrity]
    print("Synthesizing new audio...")
    # with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")
    audio = Audio(generated_wav, rate=synthesizer.sample_rate)
    # display(audio)
    # return json.dumps({'sample_rate':synthesizer.sample_rate,'audio':generated_wav.tolist()})

    write('generated_via_flask_api.mp3',
          synthesizer.sample_rate,
          generated_wav,
          normalized=True)

    folder_id = '1hOJ9GrsOHLRGe75YwLhS8tfb9zi2bvB9'
    file1 = drive.CreateFile({
        "parents": [{
            "kind": "drive#fileLink",
            "id": folder_id
        }],
        'title':
        'audio.mp3',
    })
    file1.SetContentFile('generated_via_flask_api.mp3')
    file1.Upload()
    # # Fetch permissions.
    permissions = file1.GetPermissions()
    permission = file1.InsertPermission({
        'type': 'anyone',
        'value': 'anyone',
        'role': 'reader'
    })
    token = permissions[0]['selfLink'].split('/')[-3]

    return "http://docs.google.com/uc?export=open&id=" + token
Example #24
def synthesized_voice(text, speaker_name):
    """
    Parameters
    ----------
    text : string
        text to be said in the synthesized voice
    speaker_name : string
        name of the reference speaker sample (an .mp3 file in sample_dir)

    Returns
    -------
    generated_wav : numpy.ndarray
        numpy array of the padded, synthesized audio signal
    sample_rate : int
        sample rate of the synthesized audio

    """
    sample_dir = r"src\samples\Original Samples"

    in_fpath = os.path.join(
        sample_dir, speaker_name + '.mp3'
    )  # Audio file to be synthesized, can be changed to audio file of choice, refer synthesizer.py
    reprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Synthesizing new audio...")
    with io.capture_output() as captured:
        specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")
    print("Synthesized audio generated")

    ## For saving samples you can call save_audio_local
    ## save_audio_local(generated_wav, speaker_name, synthesizer.sample_rate)

    return generated_wav, synthesizer.sample_rate
Example #25
    def clone_voice(self, embed):
        synthesizer = Synthesizer("synthesizer/saved_models/logs-pretrained/taco_pretrained")
        vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")
        with open(self.json_text) as text_json:
            data = json.load(text_json)
            for x in data:
                text = x['translation']
                # The synthesizer works in batch, so you need to put your data in a list or numpy array
                texts = [text]
                embeds = [embed]
                # If you know what the attention layer alignments are, you can retrieve them here by
                # passing return_alignments=True
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = specs[0]

                ## Generating the waveform
                print("\nSynthesizing the waveform:")
                # Synthesizing the waveform is fairly straightforward. Remember that the longer the
                # spectrogram, the more time-efficient the vocoder.
                generated_wav = vocoder.infer_waveform(spec)

                ## Post-generation
                # There's a bug with sounddevice that makes the audio cut one second earlier, so we
                # pad it.
                generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

                # Save it on the disk
                output_dir = '../temp'
                os.makedirs(output_dir, exist_ok=True)
                fpath = "%s/%d.wav" % (output_dir, x['index'])
                generated_wav *= 32767 / max(0.01, np.max(np.abs(generated_wav)))
                wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.int16))
Example #26
def generate():
    """Generates wav file from text and source audio file"""
    if 'file' not in request.files:
        return jsonify({'error': 'no audio file'})
    text = request.form.get('text')
    if not text:
        return jsonify({'error': 'no text provided'})
    request.files['file'].save('/tmp/original.wav')
    preprocessed_wav = encoder.preprocess_wav(Path('/tmp/original.wav'))
    original_wav, sampling_rate = librosa.load('/tmp/original.wav')
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    texts = [text]
    embeds = [embed]
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    generated_wav = vocoder.infer_waveform(spec)
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)
    # filename = "/tmp/{}.wav".format(int(time.time()))
    fio = io.BytesIO()
    sf.write(fio, generated_wav.astype(np.float32), synthesizer.sample_rate, None, None, 'WAV')
    fio.seek(0)
    return send_file(fio, as_attachment=True, attachment_filename='generated.wav', mimetype='audio/wav')
Example #27
def convert_voice():

    if request.method == 'POST':
        file = request.files['file']
        extension = os.path.splitext(file.filename)[1]

        f_name = str(uuid.uuid4()) + extension
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], f_name))

        file_path = os.path.join(app.config['UPLOAD_FOLDER'], f_name)

        # @param {type:"string"}
        text = "This is being said in my own voice.  The computer has learned to do an impression of me."
        in_fpath = Path(
            "/home/naman/melnetCode/clone_in_5_sec/dataset/data_voice/test.wav"
        )
        reprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Synthesizing new audio...")

        specs = synthesizer.synthesize_spectrograms([text], [embed])
        generated_wav = vocoder.infer_waveform(specs[0])
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                               mode="constant")
        # display(Audio(generated_wav, rate=synthesizer.sample_rate))
        librosa.output.write_wav('output.wav', generated_wav,
                                 synthesizer.sample_rate)

        path_to_file = "output.wav"

        return send_file(path_to_file,
                         mimetype="audio/wav",
                         as_attachment=True,
                         attachment_filename="output.wav")
Example #28
    mels = synthesizer.synthesize_spectrograms(texts, embeds)
    
    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For 
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length 
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
    # that has a detrimental effect on the quality of the audio. The default parameters are 
    # recommended in general.
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    
    print("All test passed! You can now synthesize speech.\n\n")
    
    
    ## Interactive speech generation
    print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")
    
    print("Interactive generation loop")
    num_generated = 0

    # Get the reference audio filepath
    #message = "Reference voice: enter an audio filepath of a voice to be cloned(Введите путь до клонируемого файла, например ex.wav) (mp3, " \
    #          "wav, m4a, flac, ...):\n"
Example #29
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}
        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check that the audio file is less than 10 MB
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav,
                               (0, synthesizer.sample_rate),
                               mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind the temporary file before reading the generated bytes back
        return {"audio": fp.read()}

    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
Example #30
 def test_config(self):
     ## Print some environment information (for debugging purposes)
     print("Running a test of your configuration...\n")
     try:
         if not torch.cuda.is_available():
             print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                   "for deep learning, ensure that the drivers are properly installed, and that your "
                   "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                   "not supported.", file=sys.stderr)
             quit(-1)
         device_id = torch.cuda.current_device()
         gpu_properties = torch.cuda.get_device_properties(device_id)
         print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
               "%.1fGb total memory.\n" % 
               (torch.cuda.device_count(),
                device_id,
                gpu_properties.name,
                gpu_properties.major,
                gpu_properties.minor,
                gpu_properties.total_memory / 1e9))
     
     
         ## Load the models one by one.
         print("Preparing the encoder, the synthesizer and the vocoder...")
         encoder.load_model(self.enc_model_fpath)
         print("Loaded Encoder")
         self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
         print("Loaded Synth")
         vocoder.load_model(self.voc_model_fpath)
         print("Loaded Vocoder")
         
         ## Run a test
         print("Testing your configuration with small inputs.")
         # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
         # sampling rate, which may differ.
         # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 
         # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
         # The sampling rate is the number of values (samples) recorded per second, it is set to
         # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond 
         # to an audio of 1 second.
         print("\tTesting the encoder...")
         encoder.embed_utterance(np.zeros(encoder.sampling_rate))
         
         # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
         # returns, but here we're going to make one ourselves just for the sake of showing that it's
         # possible.
         embed = np.random.rand(speaker_embedding_size)
         # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 
         # embeddings it will be).
         embed /= np.linalg.norm(embed)
         # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 
         # illustrate that
         embeds = [embed, np.zeros(speaker_embedding_size)]
         texts = ["test 1", "test 2"]
         print("\tTesting the synthesizer... (loading the model will output a lot of text)")
         mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
         
         # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 
         # can concatenate the mel spectrograms to a single one.
         mel = np.concatenate(mels, axis=1)
         # The vocoder can take a callback function to display the generation. More on that later. For 
         # now we'll simply hide it like this:
         no_action = lambda *args: None
         print("\tTesting the vocoder...")
         # For the sake of making this test short, we'll pass a short target length. The target length 
         # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 
         # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
         # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 
         # that has a detrimental effect on the quality of the audio. The default parameters are 
         # recommended in general.
         vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
         
         print("\tAll test passed!")
         
         return("All test passed!")
         
     except Exception as e:
         return("Caught exception: %s" % repr(e))