def generate_voice(self, in_fpath, text, out_fpath):
    try:
        # Load and preprocess the reference audio for the speaker encoder
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        # Derive the speaker embedding from the reference audio
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # The synthesizer works in batch, so the inputs go in lists
        texts = [text]
        embeds = [embed]
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]
        print("Created the mel spectrogram")

        ## Generating the waveform
        print("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec)
        generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")

        # librosa.output.write_wav was removed in librosa 0.8, so this requires an older librosa
        librosa.output.write_wav(out_fpath, generated_wav.astype(np.float32),
                                 self.synthesizer.sample_rate)
        print("\nSaved output as %s\n\n" % out_fpath)
    except Exception as e:
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")
def load_model(in_fpath, parser):
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true",
                        help="If True, the memory used by the synthesizer will be freed after "
                             "each use. Adds large overhead but allows saving some GPU memory "
                             "for lower-end GPUs.")
    parser.add_argument("--no_sound", action="store_true",
                        help="If True, audio won't be played.")
    args = parser.parse_args()

    # Load the three models
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    # Compute the speaker embedding of the reference audio
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    return synthesizer, sampling_rate, embed
def change_mode(character: str = "Human_Man", tone: str = "neutral"):
    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')
    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
    if local_infpath != in_fpath and character is not None:
        if tone is None:
            tone = "neutral"
        print(f'Reference sound has changed; now loading {character}:{tone}...')
        with nostdout():
            in_fpath = local_infpath
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            torch.manual_seed(seed)
            vocoder.load_model(vocoder_path)
            text_to_speech('Tea.', play_sound=False)
    else:
        print('Mode is already correct. No need to change.')
def __init__(self):
    if (Text2SpeechModel == "dc_tts"):
        self.g = Graph(mode="synthesize")
        print("Text2Speech Tensorflow Graph loaded")
    elif (Text2SpeechModel == "RTVC"):
        enc_model_fpath = os.path.join(
            root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
        syn_model_dir = os.path.join(
            root_file_path, "RTVC", "synthesizer/saved_models/logs-pretrained")
        voc_model_fpath = os.path.join(
            root_file_path, "RTVC", "vocoder/saved_models/pretrained/pretrained.pt")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(
            os.path.join(syn_model_dir, "taco_pretrained"), low_mem=False)
        vocoder.load_model(voc_model_fpath)
        in_fpath = os.path.join("/", *root_file_path.split("/")[:-1],
                                "REF/refaudioRTVC/ref.wav")
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        self.embeds = [embed]
    elif (Text2SpeechModel == "AudioSynth"):
        taco_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml')
        tacotron2_config = AutoConfig.from_pretrained(taco_pretrained_config_path)
        taco_path = os.path.join(root_file_path, "AudioSynth/tacotron2-120k.h5")
        self.tacotron2 = TFAutoModel.from_pretrained(
            config=tacotron2_config,
            pretrained_path=taco_path,
            training=False,
            name="tacotron2")
        melgan_stft_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml')
        melgan_stft_config = AutoConfig.from_pretrained(melgan_stft_pretrained_config_path)
        melgan_stft_path = os.path.join(root_file_path, "AudioSynth/melgan.stft-2M.h5")
        self.melgan_stft = TFAutoModel.from_pretrained(
            config=melgan_stft_config,
            pretrained_path=melgan_stft_path,
            name="melgan_stft")
        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=os.path.join(root_file_path, "AudioSynth/ljspeech_mapper.json"))
        mels, alignment_history, audios = do_synthesis(
            "Hello, how can I help you today?", self.tacotron2, self.melgan_stft,
            "TACOTRON", "MELGAN-STFT", self.processor)
def text_to_speech(text: str, play_sound: bool = True):
    start = timer()
    texts = [text]
    embeds = [embed]
    print("Creating the MEL spectrogram")
    with nostdout():
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Generating audio")
    with nostdout():
        generated_wav = vocoder.infer_waveform(spec)
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate))
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    filename = f'tts_generated{filenum}.wav'
    sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
    print(text)
    if play_sound:
        os.system(f'afplay {filename} &')
    elapsed_time = timer() - start
    print(f'Generated in {elapsed_time}')
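# Hedged alternative (not part of the original code): `afplay` above is macOS-only.
# Assuming the `sounddevice` package is available, the generated waveform could be played
# from memory on any platform; `play_wav` is a hypothetical helper name, not an existing API.
import sounddevice as sd

def play_wav(wav, sample_rate):
    # Blocks until playback has finished; drop blocking=True for asynchronous playback.
    sd.play(wav, samplerate=sample_rate, blocking=True)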
def add_real_utterance(self, wav, name, speaker_name):
    # Compute the mel spectrogram
    spec = Synthesizer.make_spectrogram(wav)
    self.ui.draw_spec(spec, "current")

    # Compute the embedding
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

    # Add the utterance
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
    self.utterances.add(utterance)
    self.ui.register_utterance(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "current")
    self.ui.draw_umap_projections(self.utterances)
def predict(self, path):
    # fpath = '/home/ali/Desktop/a2lsv/deneme/'
    fpaths = glob(path + "/*.wav")
    embedings = []
    embedingsDict = {}

    # Embed every wav in the directory with the speaker encoder
    for fpath in fpaths:
        wav = librosa.load(fpath, 16000)[0]
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        embed = np.array(embed).reshape(-1)
        embedings.append(embed)
        embedingsDict[fpath.split("/")[-1].split(".wav")[0]] = embed
    pickle.dump(embedingsDict, open(path + "/embedingsDict.pickle", 'wb'))

    # Project the embeddings with UMAP (n_neighbors scaled to the number of utterances),
    # then group the projections by speaker with hierarchical clustering
    # reducer = TSNE()
    reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embedings)))), metric="cosine")
    projections = reducer.fit_transform(embedings)
    thresh = 1
    clusters = hcluster.fclusterdata(projections, thresh, criterion="distance")

    # Collect the utterance ids assigned to each speaker cluster
    speakerSlices = {}
    for fpath, speaker in zip(fpaths, clusters):
        speaker = str(speaker)
        audioId = fpath.split('/')[-1].split('.')[0]
        if speaker not in speakerSlices.keys():
            speakerSlices[speaker] = [int(audioId)]
        else:
            speakerSlices[speaker] += [int(audioId)]
    for k, v in speakerSlices.items():
        v.sort()
    return speakerSlices
def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)
    self.ui.save_button.setDisabled(False)

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    if speaker_name is not None:
        name = speaker_name
    else:
        name = "unknown"
    name = name + "_gen_%05d" % np.random.randint(100000)
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    self.utterances.add(utterance)
    self.ui.register_utterance(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
def preprocess_embeddings(self, path, ext_audio, ext_embed):
    for i in range(1, len(self._walker)):
        fileid = self._walker[i]
        speaker_id, chapter_id, utterance_id = fileid.split("-")
        fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id
        file_audio = fileid_audio + ext_audio
        file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)
        file_embed = fileid_audio + ext_embed
        file_embed = os.path.join(path, speaker_id, chapter_id, file_embed)

        # Load audio
        waveform, sample_rate = torchaudio.load(file_audio)
        print("Loaded file: ", fileid)

        # Calculate speaker embedding
        wav = waveform.transpose(0, 1).detach().numpy().squeeze()
        preprocessed_wav = styleEncoder.preprocess_wav(wav, sample_rate)
        embedding = styleEncoder.embed_utterance(preprocessed_wav)

        # Save embeddings to corresponding csv files
        data = asarray(embedding)
        savetxt(file_embed, data, delimiter=',')
        print("Saved embedding: ", file_embed)
def DeepTalk_encoder(file_path, model_save_path, module_name, preprocess=True,
                     normalize=True, sampling_rate=8000, duration=None):
    encoder.load_model(model_save_path, module_name=module_name)

    if (preprocess):
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio, using_partials=True,
                                                       return_partials=True)
    if (normalize):
        embed = embed / np.linalg.norm(embed)
    return embed
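# Hedged usage sketch (not from the original source): the reference wav, checkpoint path and
# module name below are placeholders for whatever DeepTalk checkpoint is actually available.
if __name__ == "__main__":
    ref_embed = DeepTalk_encoder("ref_speaker.wav",
                                 model_save_path="encoder/saved_models/pretrained.pt",
                                 module_name="default",
                                 preprocess=True,
                                 normalize=True)
    # With normalize=True the result is a unit-norm 1-D speaker embedding
    print(ref_embed.shape)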
def synth(text, audio_file):
    """
    Parameters
    ----------
    text : string
        text to be said in synthesized voice
    audio_file : filepath
        filepath for audio file in wav format

    Returns
    -------
    generated_wav : numpy.ndarray
        Numpy padded array of synthesized audio signal
    """
    # Embed the reference speaker given by the caller
    in_fpath = Path(audio_file)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    print("Synthesizing new audio...")
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    return generated_wav
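# Hedged usage sketch (not part of the original code): writing the waveform returned by
# synth() to disk, assuming `import soundfile as sf` and the global `synthesizer` used above.
# `save_synth` and the output path are hypothetical names.
def save_synth(text, audio_file, out_path="cloned.wav"):
    generated_wav = synth(text, audio_file)
    sf.write(out_path, generated_wav.astype(np.float32), synthesizer.sample_rate)
    return out_path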
def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name,
                     self.ui.current_utterance_name])
    ftime = '{}'.format(int(time.time()))
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
    fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
    audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    name = speaker_name + "_gen_%05d" % int(time.time())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save
    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
async def create_upload_file(text: str):
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    # Generating the waveform
    print("Synthesizing the waveform:")
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    # Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)

    # Write the wav into an in-memory buffer and stream it back
    output = BytesIO()
    wavfile.write(output, synthesizer.sample_rate, generated_wav.astype(np.float32))
    # Rewind the buffer so StreamingResponse reads from the beginning
    output.seek(0)
    return StreamingResponse(output, media_type="audio/x-wav")
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
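# Hedged usage sketch (not from the original source): preprocess_wav() only resamples when a
# source rate is passed, so audio handed to transform_embed() should already be at the
# encoder's rate (16 kHz for the pretrained RTVC encoder). The file path and helper name
# below are placeholders.
def embed_from_file(fpath, encoder_model_fpath=Path("encoder/saved_models/pretrained.pt")):
    import librosa
    wav, _ = librosa.load(fpath, sr=16000)
    return transform_embed(wav, encoder_model_fpath)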
def generate():
    text_to_be_analyzed = request.get_json()["text"]

    # Send the user text to Dialogflow and use the fulfillment text as the TTS input
    session_client = dialogflow.SessionsClient()
    session = session_client.session_path(DIALOGFLOW_PROJECT_ID, SESSION_ID)
    text_input = dialogflow.types.TextInput(
        text=text_to_be_analyzed, language_code=DIALOGFLOW_LANGUAGE_CODE)
    query_input = dialogflow.types.QueryInput(text=text_input)
    try:
        response = session_client.detect_intent(session=session, query_input=query_input)
    except InvalidArgument:
        raise
    text = response.query_result.fulfillment_text

    # Embed the reference audio
    in_fpath = Path("audio.wav")
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    # Synthesize and vocode the reply
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)

    if os.path.exists("temp.wav"):
        os.remove("temp.wav")
    else:
        print("The file does not exist")
    if os.path.exists("temp.mp3"):
        os.remove("temp.mp3")

    # Convert to mp3 and return it as a base64 data URI
    sf.write("temp.wav", generated_wav, synthesizer.sample_rate)
    AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
    encoded_gen_wav_string = "data:audio/mp3;base64,"
    with open("temp.mp3", "rb") as f1:
        encoded_f1 = base64.b64encode(f1.read())
        encoded_gen_wav_string += str(encoded_f1, 'ascii', 'ignore')
    # encoded_gen_wav_bytes = base64.b64encode(generated_wav)
    # encoded_gen_wav_string = str(encoded_gen_wav_bytes, 'ascii', 'ignore')

    res = {
        "data": encoded_gen_wav_string,
        "rate": synthesizer.sample_rate,
        "text": text
    }
    # sf.write("demo_output.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
    return jsonify(res), 200
def synth():
    text = "hey welcome to programming hut"  #@param {type:"string"}
    print("Now recording for 30 seconds, say what you will...")
    ### record
    record(30)
    print("Audio recording complete")

    in_fpath = Path("audio.wav")
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    print("Synthesizing new audio...")
    with io.capture_output() as captured:
        specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    display(Audio(generated_wav, rate=synthesizer.sample_rate))
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
def get_tensor(file_path, preprocess=True, sampling_rate=8000, duration=None):
    if (preprocess):
        ref_audio = encoder.preprocess_wav(file_path)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]
    return ref_audio
def convert(self, text, in_fpath, outfn):
    print(f"converting\ntext:\n {text}\n\n wavfn: {in_fpath}")
    print(f"outfn: {outfn}")
    try:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # The synthesizer works in batch, so you need to put your data in a list or numpy array
        texts = [text]
        embeds = [embed]
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]
        print("Created the mel spectrogram")

        ## Generating the waveform
        print("Synthesizing the waveform:")
        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        generated_wav = vocoder.infer_waveform(spec)
        # TODO: check whether this padding is necessary
        generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")

        # TODO: Save it on the disk?
        # fpath = "demo_output_%02d.wav" % num_generated
        print(generated_wav.dtype)
        librosa.output.write_wav(outfn, generated_wav.astype(np.float32),
                                 self.synthesizer.sample_rate)
        # num_generated += 1
        print("\nSaved output as %s\n\n" % outfn)
        return True
    except Exception as e:
        print("Caught exception: %s" % repr(e))
        return False
def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
    """Get PPG and Mel (+ optional F0) for an utterance.

    Args:
        data_utterance_path: The path to the data utterance protocol buffer.
        is_full_ppg: If True, will use the full PPGs.

    Returns:
        feats: A list [ppg, mel, dvec (speaker embedding)] for the utterance.
    """
    utt = Utterance()
    fs, wav = wavfile.read(data_utterance_path)
    utt.fs = fs
    utt.wav = wav
    utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

    audio = torch.FloatTensor(utt.wav.astype(np.float32))
    fs = utt.fs
    if fs != self.stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            fs, self.stft.sampling_rate))
    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # (1, n_mel_channels, T)
    acoustic_feats = self.stft.mel_spectrogram(audio_norm)
    # (n_mel_channels, T)
    acoustic_feats = torch.squeeze(acoustic_feats, 0)
    # (T, n_mel_channels)
    acoustic_feats = acoustic_feats.transpose(0, 1)

    # print("encoder model path", self.encoder_model_fpath)
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath)
    # wav = np.load(data_utterance_path)
    wav = encoder.preprocess_wav(data_utterance_path)  # wav
    embed = encoder.embed_utterance(wav)
    # print("spkr embedding", embed)
    # print("shape of ppg, acoustic feats and spkr embedding", (utt.ppg).shape, acoustic_feats.shape, embed.shape)

    if is_full_ppg:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.ppg, acoustic_feats, embed]
    else:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.monophone_ppg, acoustic_feats, embed]
def embed_voice(self):
    encoder.load_model("encoder/saved_models/pretrained.pt")
    in_fpath = Path(self.voice_file)

    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    return embed
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
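# Hedged usage sketch (not from the original source): a helper with this signature is
# typically mapped over (wav_path, embed_path) pairs, for example with a multiprocessing
# pool; `embed_all` and the pair list are illustrative names only.
from functools import partial
from multiprocessing import Pool

def embed_all(fpath_pairs, encoder_model_fpath, n_processes=4):
    # Each worker loads the encoder lazily on first use (see the is_loaded() check above)
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        pool.map(func, fpath_pairs)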
def get_embed(self, wav):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath, device='cpu')
        # Use the CPU to avoid the following error in forked worker processes:
        # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with
        # multiprocessing, you must use the 'spawn' start method"
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
def get_spk_embed(load_path, enc_model_fpath):
    file_name = load_path.split('/')[-1]
    wav = load_wav(load_path)
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)
    return spk_embd, file_name
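# Hedged usage sketch (not from the original source): the reference wav and checkpoint paths
# below are placeholders. With the pretrained RTVC encoder the returned tensor is expected to
# have shape (1, 256) after the unsqueeze above.
if __name__ == "__main__":
    spk_embd, file_name = get_spk_embed(
        "refs/speaker01.wav",
        enc_model_fpath=Path("encoder/saved_models/pretrained.pt"))
    print(file_name, spk_embd.shape)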
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = embed_fpath = fpaths
    embed_fpath = embed_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
async def generate_wav(text, filename):
    user_id = "russell"
    embed_path = "user_data/embeds/{}.npy".format(user_id)
    embed_path = Path(embed_path)
    if embed_path.is_file():
        embed = np.load(embed_path)
        print("Loaded embedding from {}".format(embed_path))
    else:
        raise FileNotFoundError("user embedding not found")

    # ================== synthesizer ==================
    start_time = time.time()
    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")
    print("--- synthesizer: %s seconds ---" % (time.time() - start_time))

    # ================== vocoder ==================
    start_time = time.time()
    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)
    print("")
    print("--- vocoder: %s seconds ---" % (time.time() - start_time))

    # ================== post generation ==================
    start_time = time.time()
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    print("--- post generation: %s seconds ---" % (time.time() - start_time))

    sf.write("./user_data/generated_voice/%s/" % user_id + "%s.wav" % filename,
             generated_wav.astype(np.float32), synthesizer.sample_rate)
def getWav(self, referenceVoiceWavPath, words):
    print("getWav1")
    preprocessed_wav = encoder.preprocess_wav(referenceVoiceWavPath)
    embed = encoder.computeEmbedding(preprocessed_wav)
    print("getWav2")
    embeds = [embed]
    specs = self.synthesizer.synthesize_spectrograms(words, embeds)
    spec = specs[0]
    generated_wav = vocoder.infer_waveform(spec)
    generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")
    generated_wav = trim_long_silences(generated_wav)
    return generated_wav, self.synthesizer.sample_rate
def compute_embedding(self, spk_file):
    in_fpath = spk_file

    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file successfully")

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding\n")
    return embed
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src
    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def run_voiceCloning(filename):
    in_fpath = dataPath + "/" + filename

    # transforming mp3 into wav
    subprocess.call(['ffmpeg', '-i', in_fpath + '.mp3', in_fpath + '.wav'])
    time.sleep(5)

    # running the encoder on the audio input
    original_wav, sampling_rate = librosa.load(Path(in_fpath + '.wav'))
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

    # getting the embeds from the encoder
    embed = encoder.embed_utterance(preprocessed_wav)
    return audioFromEmbeds(filename, embed)