def synthesize(self, text, mel_targets=None, reference_mel=None):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    # Optional conditioning: teacher-forced mel targets and/or a reference mel
    # for the style encoder. Each gets a leading batch axis of 1.
    if mel_targets is not None:
        mel_targets = np.expand_dims(mel_targets, 0)
        feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
    if reference_mel is not None:
        reference_mel = np.expand_dims(reference_mel, 0)
        feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    # Trim trailing silence before encoding the wav.
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
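# A minimal sketch of preparing the reference_mel argument from a wav file,
# assuming the same audio module used above, whose melspectrogram returns a
# [num_mels, frames] array (the path and the `synthesizer` name are
# illustrative, not from the original code):
ref_wav = audio.load_wav('reference.wav')
ref_mel = audio.melspectrogram(ref_wav).astype(np.float32).T  # -> [frames, num_mels]
wav_bytes = synthesizer.synthesize('Hello there.', reference_mel=ref_mel)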
def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    print('***cleaner_names:', cleaner_names)
    print('***text:', text)
    # Split long input into sentences and synthesize each one separately.
    texts = tokenizer.tokenize(text)
    waves = []
    for sentence in texts:
        seq = text_to_sequence(sentence, cleaner_names)
        print('***seq:', seq)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        waves.append(wav)
    # Concatenate the per-sentence waveforms into one clip.
    wavestack = np.concatenate(waves)
    out = io.BytesIO()
    audio.save_wav(wavestack, out)
    return out.getvalue()
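# The plain concatenation above can click at sentence boundaries. A short
# linear crossfade is one common smoothing; this is a sketch, not part of the
# original code, and the fade length (in samples) is an arbitrary choice:
def crossfade_concat(waves, fade=240):
    out = waves[0]
    for w in waves[1:]:
        n = min(fade, len(out), len(w))
        ramp = np.linspace(0.0, 1.0, n)
        # Blend the tail of the running clip with the head of the next one.
        overlap = out[len(out) - n:] * (1.0 - ramp) + w[:n] * ramp
        out = np.concatenate([out[:len(out) - n], overlap, w[n:]])
    return out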
def synthesize(self, text, mel_targets=None, reference_mel=None, reference_weight=None,
               alignment_path=None, reference_path=None, style_path=None, weight_path=None):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    # At most one conditioning signal is used: teacher-forced mel targets, a
    # reference mel for the style encoder, or explicit style-token weights.
    if mel_targets is not None:
        mel_targets = np.expand_dims(mel_targets, 0)
        feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
    elif reference_mel is not None:
        reference_mel = np.expand_dims(reference_mel, 0)
        feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})
    elif reference_weight is not None:
        feed_dict.update({self.model.reference_weight: np.asarray(reference_weight, dtype=np.float32)})
    wav, alignments, style_embeddings, style_weights = self.session.run(
        [self.wav_output, self.alignments, self.style_embeddings, self.style_weights],
        feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # Number of decoder frames that survive endpoint trimming.
    n_frame = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
    plot.plot_alignment(alignments[:, :n_frame], alignment_path, info='%s' % text)
    plot.plot_weight(style_weights, weight_path)
    # np.save(reference_path, refer_embeddings)
    np.save(style_path, style_embeddings)
    return out.getvalue()
def synthesize(self, text, out):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    audio.save_wav(wav, out)
def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
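# Usage sketch for the byte-returning variants (hedged: assumes a Synthesizer
# class whose load() restores the checkpoint and builds self.session,
# self.model and self.wav_output; the checkpoint path is illustrative):
synth = Synthesizer()
synth.load('logs-tacotron/model.ckpt-185000')
with open('hello.wav', 'wb') as f:
    f.write(synth.synthesize('Hello world.'))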
def synthesize(self, lab_name):
    lab = np.load(lab_name)
    lab = np.expand_dims(lab, axis=0)
    feed_dict = {
        self.model.inputs: lab,
        self.model.input_lengths: np.asarray([lab.shape[1]], dtype=np.int32),
        # change 2 to another id based on the desired speaker
        self.model.speaker_ids: np.asarray([2], dtype=np.int32)
    }
    wav, mel_outputs = self.session.run(
        [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    # Trim the mel output to the same endpoint as the waveform, using the same
    # frame-count formula as the other variants in this file.
    frames = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    mel_outputs = mel_outputs[:frames, :]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue(), mel_outputs
def synthesize(self, in_file):
    src_spectrogram = audio.spectrogram(
        in_file,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    feed_dict = {
        self.model.inputs: [np.asarray(src_spectrogram, dtype=np.float32)],
        self.model.input_lengths: np.asarray([len(src_spectrogram)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    # Fetch the predicted linear spectrogram and invert it to audio outside
    # the graph, rather than running a wav_output node.
    linears = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
    wav = audio.inv_spectrogram(linears.T)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
def synthesize(self, text):  # for demo_server
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()  # returns a bytes object
def synthesize(self, input_path):
    s, sr = sf.read(input_path)
    spec = audio.melspectrogram(s).astype(np.float32).T
    feed_dict = {
        self.model.inputs: [np.asarray(spec, dtype=np.float32)],
        self.model.input_lengths: np.asarray([spec.shape[0]], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
def synthesize(self, text):
    # Convert the input text to ARPAbet before sequencing.
    text = arpa.to_arpa(text)
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    # Write to the in-memory buffer so the returned bytes are non-empty,
    # and also keep the on-disk copy.
    audio.save_wav(wav, out)
    audio.save_wav(wav, "/content/drive/MyDrive/voice_cloning/out_sample")
    return out.getvalue()
def synthesize(self, text, base_path, idx):
    seq = text_to_sequence(text)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    input_seq, wav, alignment = self.session.run(
        [self.inputs, self.wav_output, self.alignments], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    input_seq = sequence_to_text(input_seq)
    plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx), input_seq)
    return out.getvalue()
def synthesize(self, path_in, path_re, mel_targets=None, reference_mel=None, alignment_path=None):
    wav_in = audio.load_wav(path_in)
    wav_re = audio.load_wav(path_re)
    mel_in = audio.melspectrogram(wav_in).astype(np.float32)
    mel_re = audio.melspectrogram(wav_re).astype(np.float32)
    feed_dict = {
        self.model.inputs: [mel_in.T],
        # melspectrogram returns [num_mels, frames]; the input length is the
        # frame count, not len(mel_in) (which would be the mel-channel count).
        self.model.input_lengths: np.asarray([mel_in.shape[1]], dtype=np.int32),
        self.model.inputs_jp: [mel_re.T],
    }
    wav_out, alignments = self.session.run([self.wav_output, self.alignments],
                                           feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav_out)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    # Build a unique file name: the current timestamp plus a zero-padded
    # two-digit random suffix.
    nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    randomNum = str(random.randint(0, 99)).zfill(2)
    uniqueNum = nowTime + randomNum
    out_name = uniqueNum + ".wav"
    out_dir = "static\\out\\" + out_name
    audio.save_wav(wav, out_dir)
    return out_dir, out_name
def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # Chinese front end; if the input is raw hanzi it should first be
    # converted to pinyin (e.g. text = sentence_to_pinyin(text)).
    seq = text_to_sequence_zh(text, cleaner_names)
    print(seq)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
def synthesize(self, text, title):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # Save a copy to disk and play it back through ALSA.
    audio_dir = os.path.join(os.getcwd(), 'narration', 'saved_audio', title + '.wav')
    print(audio_dir)
    with open(audio_dir, 'wb') as f:
        f.write(out.getvalue())
    os.system('aplay ' + audio_dir)
    return out.getvalue()
def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # g2p = G2p()
    # Dual text input, separated by '|': character text before the pipe and
    # (presumably g2p-produced) phoneme text after it.
    parts = text.split('|')
    c_text, p_text = parts[0], parts[1]
    c_seq = text_to_sequence(c_text, cleaner_names)
    p_seq = text_to_sequence(p_text, cleaner_names)
    feed_dict = {
        self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
        self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
        self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
        self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
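# Usage sketch for the dual-input variant (the "char|phoneme" format is
# inferred from the c_/p_ prefixes and the commented-out G2p reference, so
# treat both the format and the sample phonemes as assumptions):
wav_bytes = synth.synthesize('printing|P R IH1 N T IH0 NG')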
def synthesize(self, text):
    # Busy-flag guard: hold the mutex only while checking and setting the
    # flag, so a concurrent caller gets None back instead of blocking for the
    # whole synthesis.
    with Synthesizer.mutex:
        if Synthesizer.processing:
            return None
        Synthesizer.processing = True
    try:
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
    finally:
        # Clear the flag even if synthesis raises.
        Synthesizer.processing = False
def synthesize(self, text, return_wav=False):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav, alignment = self.session.run([self.wav_output, self.alignment],
                                      feed_dict=feed_dict)
    # Trim the alignment matrix by the same fraction as the waveform so the
    # two stay in sync after endpointing.
    audio_endpoint = audio.find_endpoint(wav)
    alignment_endpoint = find_alignment_endpoint(alignment.shape,
                                                 audio_endpoint / len(wav))
    wav = wav[:audio_endpoint]
    alignment = alignment[:, :alignment_endpoint]
    if return_wav:
        return wav, alignment
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue(), alignment
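# find_alignment_endpoint is not defined in any of these variants; a minimal
# sketch of what it plausibly does, assuming the alignment is laid out as
# [encoder_steps, decoder_frames] and should keep the same fraction of decoder
# frames as the endpointed waveform keeps of its samples:
def find_alignment_endpoint(alignment_shape, audio_fraction):
    return max(1, int(round(alignment_shape[1] * audio_fraction)))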
def synthesize(self, images_dir, output_wav_dir):
    # Walk the image directory and synthesize one wav per PNG image.
    for path, _, filenames in os.walk(images_dir):
        for i in trange(len(filenames)):
            test_file = filenames[i]
            if test_file.endswith('.png'):
                base_file_name, _ = os.path.splitext(test_file)
                raw_image = imread(os.path.join(path, test_file), mode='RGB')
                processed_image = imresize(raw_image, (224, 224, 3))
                feed_dict = {
                    self.model.inputs: [np.asarray(processed_image, dtype=np.float32)],
                }
                wav = self.session.run(self.wav_output, feed_dict=feed_dict)
                wav = audio.inv_preemphasis(wav)
                wav = wav[:audio.find_endpoint(wav)]
                audio_out_path = os.path.join(
                    output_wav_dir, 'eval-{}.wav'.format(base_file_name))
                audio.save_wav(wav, audio_out_path)
                print('Wav - {} generated successfully!'.format(audio_out_path))
def _pml_to_wav(pml_features, cfg, shift=0.005, dftlen=4096, nm_cont=False,
                verbose_level=0, mean_norm=None, std_norm=None, spec_type='mcep',
                pp_mcep=False, find_endpoint=False, threshold_db=0):
    # get the mean and variance, and denormalise
    if mean_norm is not None and std_norm is not None:
        std_tiled = np.tile(std_norm, (pml_features.shape[0], 1))
        mean_tiled = np.tile(mean_norm, (pml_features.shape[0], 1))
        pml_features = pml_features * std_tiled + mean_tiled

    # f0 comes from the lf0 stream
    f0 = pml_features[:, cfg.acoustic_start_index['lf0']:
                      cfg.acoustic_start_index['lf0'] + cfg.acoustic_in_dimension_dict['lf0']]
    f0 = np.squeeze(f0)  # remove the extra 1 dimension here
    f0[f0 > 0] = np.exp(f0[f0 > 0])
    ts = shift * np.arange(len(f0))
    f0s = np.vstack((ts, f0)).T

    # the spectrum comes from either mcep or fwbnd features
    if spec_type == 'mcep':
        mcep = pml_features[:, cfg.acoustic_start_index['mgc']:
                            cfg.acoustic_start_index['mgc'] + cfg.acoustic_in_dimension_dict['mgc']]
        if pp_mcep:
            from lib.merlin import generate_pp
            mcep = generate_pp.mcep_postproc_sptk(mcep, cfg.wav_sr, dftlen=dftlen)
        spec = sp.mcep2spec(mcep, sp.bark_alpha(cfg.wav_sr), dftlen)
    elif spec_type == 'fwbnd':
        compspec = pml_features[:, cfg.acoustic_start_index['mgc']:
                                cfg.acoustic_start_index['mgc'] + cfg.acoustic_in_dimension_dict['mgc']]
        spec = np.exp(sp.fwbnd2linbnd(compspec, cfg.wav_sr, dftlen))
        if pp_mcep:
            from lib.merlin import generate_pp
            mcep = sp.spec2mcep(spec * cfg.wav_sr, sp.bark_alpha(cfg.wav_sr), 256)
            mcep_pp = generate_pp.mcep_postproc_sptk(mcep, cfg.wav_sr, dftlen=dftlen)
            spec = sp.mcep2spec(mcep_pp, sp.bark_alpha(cfg.wav_sr), dftlen=dftlen) / cfg.wav_sr

    # the noise mask (NM) comes from the bap stream
    fwnm = pml_features[:, cfg.acoustic_start_index['bap']:
                        cfg.acoustic_start_index['bap'] + cfg.acoustic_in_dimension_dict['bap']]
    nm = sp.fwbnd2linbnd(fwnm, cfg.wav_sr, dftlen)

    # use the standard PML vocoder
    wav = synthesize(cfg.wav_sr, f0s, spec, NM=nm, nm_cont=nm_cont, verbose=verbose_level)

    # clip the wav to the endpoint if required
    if find_endpoint:
        wav = wav[:audio.find_endpoint(wav, threshold_db=threshold_db)]

    # return the raw wav data
    return wav
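# Usage sketch for _pml_to_wav (hedged: cfg is assumed to provide wav_sr,
# acoustic_start_index and acoustic_in_dimension_dict entries for 'lf0',
# 'mgc' and 'bap' matching the column layout of the feature matrix; the
# file paths are illustrative):
pml = np.load('sample_pml.npy')  # [frames, lf0 + mgc + bap dims]
wav = _pml_to_wav(pml, cfg, spec_type='mcep', pp_mcep=True)
audio.save_wav(wav, 'sample.wav')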