Example no. 1
    def generate_voice(self, in_fpath, text, out_fpath):
        try:
            # preprocess_wav accepts either a filepath or an already-loaded
            # waveform; the two calls below are equivalent and the second
            # result overwrites the first.
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file successfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            ## Generating the waveform
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)

            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")
            # Note: librosa.output.write_wav requires librosa < 0.8 (it was
            # removed in 0.8; soundfile.write is the usual replacement).
            librosa.output.write_wav(out_fpath,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % out_fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example no. 2
def load_model(in_fpath, parser):

	parser.add_argument("-e", "--enc_model_fpath", type=Path, 
		        default="encoder/saved_models/pretrained.pt",
		        help="Path to a saved encoder")
	parser.add_argument("-s", "--syn_model_dir", type=Path, 
		        default="synthesizer/saved_models/logs-pretrained/",
		        help="Directory containing the synthesizer model")
	parser.add_argument("-v", "--voc_model_fpath", type=Path, 
		        default="vocoder/saved_models/pretrained/pretrained.pt",
		        help="Path to a saved vocoder")
	parser.add_argument("--low_mem", action="store_true", help=\
	"If True, the memory used by the synthesizer will be freed after each use. Adds large "
	"overhead but allows to save some GPU memory for lower-end GPUs.")
	parser.add_argument("--no_sound", action="store_true", help=\
	"If True, audio won't be played.")
	args = parser.parse_args()
	encoder.load_model(args.enc_model_fpath)
	synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
	vocoder.load_model(args.voc_model_fpath)

	preprocessed_wav = encoder.preprocess_wav(in_fpath)
	original_wav, sampling_rate = librosa.load(in_fpath)
	preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
	embed = encoder.embed_utterance(preprocessed_wav)
	
	return synthesizer, sampling_rate, embed
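Example no. 3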
def change_mode(character: str = "Human_Man", tone: str = "neutral"):

    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')

    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
    if local_infpath != in_fpath and character is not None:
        if tone is None:
            tone = "neutral"

        print(
            f'Reference sound has changed; now loading {character}:{tone}...')
        with nostdout():
            in_fpath = local_infpath

            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)

            embed = encoder.embed_utterance(preprocessed_wav)
            torch.manual_seed(seed)
            vocoder.load_model(vocoder_path)
            text_to_speech('Tea.', play_sound=False)
    else:
        print('Mode is already correct. No need to change.')
Example no. 4
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
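Example no. 5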
def text_to_speech(text: str, play_sound: bool = True):
    start = timer()
    texts = [text]
    embeds = [embed]
    print("Creating the MEL spectrogram")
    with nostdout():
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]

    print("Generating audio")

    with nostdout():
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate))

        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
        generated_wav = encoder.preprocess_wav(generated_wav)

    filename = f'tts_generated{filenum}.wav'
    sf.write(filename, generated_wav.astype(np.float32),
             synthesizer.sample_rate)

    print(text)
    if play_sound:
        os.system(f'afplay {filename} &')
    elapsed_time = timer() - start
    print(f'Generated in {elapsed_time}')
Example no. 6
    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'),
                embed,
                allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)
Example no. 7
    def predict(self, path):
        # fpath = '/home/ali/Desktop/a2lsv/deneme/'
        fpaths = glob(path+"/*.wav")
        embedings = []
        embedingsDict = {}
        for fpath in fpaths:
            wav = librosa.load(fpath, sr=16000)[0]
            encoder_wav = encoder.preprocess_wav(wav)
            embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
            embed = np.array(embed).reshape(-1)
            embedings.append(embed)
            embedingsDict[fpath.split("/")[-1].split(".wav")[0]] = embed

        pickle.dump(embedingsDict, open(path+"/embedingsDict.pickle", 'wb'))

        # reducer = TSNE()
        reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embedings)))), metric="cosine")
        projections = reducer.fit_transform(embedings)
        
        thresh = 1
        clusters = hcluster.fclusterdata(projections, thresh, criterion="distance")

        speakerSlices = {}
        for fpath, speaker  in zip(fpaths, clusters):
            speaker = str(speaker)
            audioId = fpath.split('/')[-1].split('.')[0]
            if speaker not in speakerSlices.keys():
                speakerSlices[speaker] = [int(audioId)]
            else:
                speakerSlices[speaker] += [int(audioId)]
        for k, v in speakerSlices.items():
            v.sort()
        return speakerSlices
Example no. 8
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        self.ui.save_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        if speaker_name is not None:
            name = speaker_name
        else:
            name = "unknown"
        name = name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
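Example no. 9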
    def preprocess_embeddings(self, path, ext_audio, ext_embed):
        for i in range(1, len(self._walker)):
            fileid = self._walker[i]

            speaker_id, chapter_id, utterance_id = fileid.split("-")

            fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id
            file_audio = fileid_audio + ext_audio
            file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)
            file_embed = fileid_audio + ext_embed
            file_embed = os.path.join(path, speaker_id, chapter_id, file_embed)

            # Load audio
            waveform, sample_rate = torchaudio.load(file_audio)

            print("Loaded file: ", fileid)

            # Calculate speaker embedding
            wav = waveform.transpose(0, 1).detach().numpy().squeeze()
            preprocessed_wav = styleEncoder.preprocess_wav(wav, sample_rate)
            embedding = styleEncoder.embed_utterance(preprocessed_wav)

            # Save embeddings to corresponding csv files
            data = asarray(embedding)
            savetxt(file_embed, data, delimiter=',')

            print("Saved embedding: ", file_embed)
Example no. 10
def DeepTalk_encoder(file_path,
                     model_save_path,
                     module_name,
                     preprocess=True,
                     normalize=True,
                     sampling_rate=8000,
                     duration=None):

    encoder.load_model(model_save_path, module_name=module_name)

    if (preprocess):
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True)

    if (normalize):
        embed = embed / np.linalg.norm(embed)

    return embed
Example no. 11
def synth(text, audio_file):
    """

    Parameters
    ----------
    text : string
        text to be said in synthesized voice
    audio_file : filepath
        filepath for audio file in wav format

    Returns
    -------
    generated_wav : numpy.ndarray
        Numpy padded array of synthesized audio signal

    """

    in_fpath = Path(audio_file)  # reference voice to clone
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Synthesizing new audio...")
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    return generated_wav
Example no. 12
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example no. 13
async def create_upload_file(text: str):
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    # Generating the waveform
    print("Synthesizing the waveform:")

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    # Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)

    # Write it to an in-memory buffer and stream it back
    output = BytesIO()
    wavfile.write(output, synthesizer.sample_rate,
                  generated_wav.astype(np.float32))
    output.seek(0)  # rewind so the response streams from the start of the buffer
    return StreamingResponse(output, media_type="audio/x-wav")
Example no. 14
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
Example no. 15
def generate():
    text_to_be_analyzed = request.get_json()["text"]
    session_client = dialogflow.SessionsClient()
    session = session_client.session_path(DIALOGFLOW_PROJECT_ID, SESSION_ID)
    text_input = dialogflow.types.TextInput(
        text=text_to_be_analyzed, language_code=DIALOGFLOW_LANGUAGE_CODE)
    query_input = dialogflow.types.QueryInput(text=text_input)
    try:
        response = session_client.detect_intent(session=session,
                                                query_input=query_input)
    except InvalidArgument:
        raise
    text = response.query_result.fulfillment_text
    in_fpath = Path("audio.wav")
    reprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)
    if os.path.exists("temp.wav"):
        os.remove("temp.wav")
    else:
        print("The file does not exist")
    if os.path.exists("temp.mp3"):
        os.remove("temp.mp3")
    sf.write("temp.wav", generated_wav, synthesizer.sample_rate)
    AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
    encoded_gen_wav_string = "data:audio/mp3;base64,"
    with open("temp.mp3", "rb") as f1:
        encoded_f1 = base64.b64encode(f1.read())
        encoded_gen_wav_string += str(encoded_f1, 'ascii', 'ignore')

    # encoded_gen_wav_bytes= base64.b64encode(generated_wav)
    # encoded_gen_wav_string = str(encoded_gen_wav_bytes,'ascii', 'ignore')

    res = {
        "data": encoded_gen_wav_string,
        "rate": synthesizer.sample_rate,
        "text": text
    }
    # sf.write("demo_output.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
    return jsonify(res), 200
Example no. 16
def synth():
  text = "hey welcome to programming hut" #@param {type:"string"}
  print("Now recording for 10 seconds, say what you will...")
  
  ### record
  record(30)
  print("Audio recording complete")
  in_fpath = Path("audio.wav")
  reprocessed_wav = encoder.preprocess_wav(in_fpath)
  original_wav, sampling_rate = librosa.load(in_fpath)
  preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
  embed = encoder.embed_utterance(preprocessed_wav)
  print("Synthesizing new audio...")
  with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embed])
  generated_wav = vocoder.infer_waveform(specs[0])
  generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
  display(Audio(generated_wav, rate=synthesizer.sample_rate))
Example no. 17
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
Example no. 18
def get_tensor(file_path, preprocess=True, sampling_rate=8000, duration=None):
    if (preprocess):
        ref_audio = encoder.preprocess_wav(file_path)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]
    return ref_audio
Example no. 19
    def convert(self, text, in_fpath, outfn):
        print(f"converting\ntext:\n {text}\n\n wavfn: {in_fpath}")
        print(f"outfn: {outfn}")

        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file succesfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            # TODO: check whether this padding is necessary
            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")

            #TODO: Save it on the disk?
            #fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            # Note: librosa.output.write_wav requires librosa < 0.8 (it was
            # removed in 0.8; soundfile.write is the usual replacement).
            librosa.output.write_wav(outfn, generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            #num_generated += 1
            print("\nSaved output as %s\n\n" % outfn)
            return True
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            return False
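Example no. 20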
    def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
        """Get PPG and Mel (+ optional F0) for an utterance.

        Args:
            data_utterance_path: The path to the data utterance protocol buffer.
            is_full_ppg: If True, will use the full PPGs.

        Returns:
            A list [ppg, mel, dvec (speaker embedding)] for the utterance.
        """
        utt = Utterance()
        fs, wav = wavfile.read(data_utterance_path)
        utt.fs = fs
        utt.wav = wav
        utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

        audio = torch.FloatTensor(utt.wav.astype(np.float32))
        fs = utt.fs

        if fs != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                fs, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        # (1, n_mel_channels, T)
        acoustic_feats = self.stft.mel_spectrogram(audio_norm)
        # (n_mel_channels, T)
        acoustic_feats = torch.squeeze(acoustic_feats, 0)
        # (T, n_mel_channels)
        acoustic_feats = acoustic_feats.transpose(0, 1)
        
        #print("encoder model path", self.encoder_model_fpath)
        
        from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath)
        
        #wav = np.load(data_utterance_path)
        wav = encoder.preprocess_wav(data_utterance_path) # wav
        embed = encoder.embed_utterance(wav)
        #print("spkr embedding", embed)
        #print("shape of ppg, acoustic feats and spkr embedding", (utt.ppg).shape, acoustic_feats.shape, embed.shape)
        
        if is_full_ppg:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.ppg, acoustic_feats, embed]
        else:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.monophone_ppg, acoustic_feats, embed]
Example no. 21
    def embed_voice(self):
        encoder.load_model("encoder/saved_models/pretrained.pt")
        in_fpath = Path(self.voice_file)

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        return embed
Example no. 22
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example no. 23
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Use the CPU here to avoid the following error:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method"

        wav = encoder.preprocess_wav(wav)
        embed = encoder.embed_utterance(wav)
        return embed
Example no. 24
def get_spk_embed(load_path, enc_model_fpath):

    file_name = load_path.split('/')[-1]
    wav = load_wav(load_path)
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)

    return spk_embd, file_name
Example no. 25
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = embed_fpath = fpaths
    embed_fpath = embed_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example no. 26
async def generate_wav(text, filename):
    user_id = "russell"
    embed_path = "user_data/embeds/{}.npy".format(user_id)
    embed_path = Path(embed_path)

    if embed_path.is_file():
        embed = np.load(embed_path)
        print("load embedding in {}".format(embed_path))
    else:
        raise FileNotFoundError("user embedding not found")

    # ================== synthesizer ==================
    start_time = time.time()

    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    print("--- synthesizer: %s seconds ---" % (time.time() - start_time))

    # ================== vocoder ==================
    start_time = time.time()

    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)

    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)
    print("")
    print("--- vocoder: %s seconds ---" % (time.time() - start_time))

    # ================== post generation ==================
    start_time = time.time()

    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    print("--- post generation: %s seconds ---" % (time.time() - start_time))

    sf.write("./user_data/generated_voice/%s/"%(user_id) + "%s.wav"%filename, \
            generated_wav.astype(np.float32), synthesizer.sample_rate)
Example no. 27
    def getWav(self, referenceVoiceWavPath, words):
        print("getWav1")
        preprocessed_wav = encoder.preprocess_wav(referenceVoiceWavPath)
        embed = encoder.computeEmbedding(preprocessed_wav)
        print("getWav2")
        embeds = [embed]
        specs = self.synthesizer.synthesize_spectrograms(words, embeds)
        spec = specs[0]
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")
        generated_wav = trim_long_silences(generated_wav)
        return generated_wav, self.synthesizer.sample_rate
Example no. 28
    def compute_embedding(self, spk_file):
        in_fpath = spk_file

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.

        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding\n")

        return embed
Example no. 29
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src

    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example no. 30
def run_voiceCloning(filename):
    in_fpath = dataPath + "/" + filename

    #transforming mp3 into wav
    subprocess.call(['ffmpeg', '-i', in_fpath + '.mp3', in_fpath + '.wav'])
    time.sleep(5)
    #running the encoder on the audio input
    original_wav, sampling_rate = librosa.load(Path(in_fpath + '.wav'))
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    #getting the embeds from the encoder
    embed = encoder.embed_utterance(preprocessed_wav)

    return audioFromEmbeds(filename, embed)
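
For reference, the pipeline these examples share can be condensed into the minimal sketch below. It is not taken from any single snippet above: it assumes the Real-Time-Voice-Cloning package layout the snippets import from (encoder.inference, synthesizer.inference, vocoder.inference), and the model paths, reference audio, and output file are placeholders.

# Minimal sketch of the shared pipeline (assumed package layout, placeholder paths).
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# 1. Load the three pretrained models (placeholder paths).
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# 2. Embed a reference utterance; preprocess_wav resamples and trims silence.
original_wav, sampling_rate = librosa.load("reference.wav", sr=None)
embed = encoder.embed_utterance(encoder.preprocess_wav(original_wav, sampling_rate))

# 3. Text + speaker embedding -> mel spectrogram -> waveform.
spec = synthesizer.synthesize_spectrograms(["Hello world."], [embed])[0]
generated_wav = vocoder.infer_waveform(spec)

# 4. Pad (works around the sounddevice cutoff noted in the examples) and save.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
sf.write("cloned.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)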