def synthesize(self, text, mel_targets=None, reference_mel=None):
    """Run TTS inference on `text` and return the synthesized audio as WAV bytes.

    Optional `mel_targets` / `reference_mel` (2-D arrays) are batched with a
    leading axis and fed to the corresponding model placeholders when given.
    """
    cleaners = [name.strip() for name in hparams.cleaners.split(',')]
    sequence = text_to_sequence(text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(sequence, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(sequence)], dtype=np.int32),
    }
    if mel_targets is not None:
        batched_targets = np.expand_dims(mel_targets, 0)
        feeds[self.model.mel_targets] = np.asarray(batched_targets, dtype=np.float32)
    if reference_mel is not None:
        batched_reference = np.expand_dims(reference_mel, 0)
        feeds[self.model.reference_mel] = np.asarray(batched_reference, dtype=np.float32)
    wav = audio.inv_preemphasis(self.session.run(self.wav_output, feed_dict=feeds))
    wav = wav[:audio.find_endpoint(wav)]  # cut trailing silence
    buffer = io.BytesIO()
    audio.save_wav(wav, buffer)
    return buffer.getvalue()
def synthesize(self, text):
    """Synthesize `text` sentence-by-sentence and return one joined WAV as bytes.

    The text is split into sentences by `tokenizer`, each sentence is
    synthesized independently, each waveform is trimmed past its detected
    endpoint, and the pieces are concatenated into a single clip.

    Raises:
        ValueError: if the tokenizer produces no sentences.
    """
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    sentences = tokenizer.tokenize(text)
    if not sentences:
        # The original indexed waves[0] unconditionally and crashed here.
        raise ValueError('No sentences to synthesize in input text')
    waves = []
    for sentence in sentences:
        seq = text_to_sequence(sentence, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        waves.append(wav[:audio.find_endpoint(wav)])  # drop trailing silence
    # Single O(n) concatenation instead of repeated np.hstack (O(n^2) copies).
    full_wav = np.concatenate(waves)
    out = io.BytesIO()
    audio.save_wav(full_wav, out)
    return out.getvalue()
def synthesize(self, text, mel_targets=None, reference_mel=None, reference_weight=None, alignment_path=None, reference_path=None, style_path=None, weight_path=None):
    """Synthesize `text`, save alignment and style-weight plots plus the
    style embedding, and return the WAV bytes.

    At most one of `mel_targets`, `reference_mel`, `reference_weight` is
    fed to the model (checked in that priority order).
    """
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    if mel_targets is not None:
        feed_dict[self.model.mel_targets] = np.asarray(
            np.expand_dims(mel_targets, 0), dtype=np.float32)
    elif reference_mel is not None:
        feed_dict[self.model.reference_mel] = np.asarray(
            np.expand_dims(reference_mel, 0), dtype=np.float32)
    elif reference_weight is not None:
        feed_dict[self.model.reference_weight] = np.asarray(
            reference_weight, dtype=np.float32)

    fetches = [self.wav_output, self.alignments, self.style_embeddings,
               self.style_weights]
    wav, alignments, style_embeddings, style_weights = self.session.run(
        fetches, feed_dict=feed_dict)

    wav = audio.inv_preemphasis(wav)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]
    out = io.BytesIO()
    audio.save_wav(wav, out)

    # Number of spectrogram frames covered by the trimmed waveform.
    samples_per_frame = hparams.frame_shift_ms / 1000 * hparams.sample_rate
    n_frame = int(end_point / samples_per_frame) + 1
    wrapped_text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
    plot.plot_alignment(alignments[:, :n_frame], alignment_path,
                        info='%s' % (wrapped_text))
    plot.plot_weight(style_weights, weight_path)
    np.save(style_path, style_embeddings)
    return out.getvalue()
def synthesize(self, text):
    """Return the synthesized waveform for `text` as WAV-encoded bytes."""
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    raw_wav = self.session.run(self.wav_output, feed_dict=feeds)
    buf = io.BytesIO()
    audio.save_wav(audio.inv_preemphasis(raw_wav), buf)
    return buf.getvalue()
def synthesize(self, text, out):
    """Synthesize `text` and write the WAV into `out` (path or file-like object)."""
    cleaners = [name.strip() for name in hparams.cleaners.split(',')]
    sequence = text_to_sequence(text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(sequence, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(sequence)], dtype=np.int32),
    }
    waveform = audio.inv_preemphasis(
        self.session.run(self.wav_output, feed_dict=feeds))
    # Trim everything after the detected end of speech before saving.
    audio.save_wav(waveform[:audio.find_endpoint(waveform)], out)
    return
def synthesize(self, text):
    """Synthesize `text` and return the trimmed waveform as WAV bytes."""
    cleaners = [part.strip() for part in hparams.cleaners.split(',')]
    ids = text_to_sequence(text, cleaners)
    wav = self.session.run(
        self.wav_output,
        feed_dict={
            self.model.inputs: [np.asarray(ids, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(ids)], dtype=np.int32),
        })
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]  # cut trailing silence
    stream = io.BytesIO()
    audio.save_wav(wav, stream)
    return stream.getvalue()
def synthesize(self, in_file):
    """Run the model on the audio in `in_file` and return the output WAV bytes.

    The source file is converted to a spectrogram that the model consumes
    directly (spectrogram in, waveform out).
    """
    src_spec = audio.spectrogram(
        in_file,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    feeds = {
        self.model.inputs: [np.asarray(src_spec, dtype=np.float32)],
        self.model.input_lengths: np.asarray([len(src_spec)], dtype=np.int32),
    }
    wav = audio.inv_preemphasis(
        self.session.run(self.wav_output, feed_dict=feeds))
    wav = wav[:audio.find_endpoint(wav)]
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    return buf.getvalue()
def synthesize(self, text):
    """Synthesize Chinese `text` and return the WAV bytes.

    The text is first romanized to pinyin with numeric tone marks, which is
    the form the cleaners and sequence encoding operate on.
    """
    # Convert Chinese characters to space-separated pinyin with tone numbers.
    pinyin_text = Pinyin().get_pinyin(text, " ", tone_marks='numbers')
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    # Encode the pinyin string as a sequence of symbol ids.
    seq = text_to_sequence(pinyin_text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)}
    wav = audio.inv_preemphasis(
        self.session.run(self.wav_output, feed_dict=feeds))
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    return buf.getvalue()
def synthesize(self, input_path):
    """Read audio from `input_path`, run it through the model, return WAV bytes."""
    samples, _sample_rate = sf.read(input_path)
    # Transpose so time is the leading axis; input_lengths counts frames.
    mel = audio.melspectrogram(samples).astype(np.float32).T
    feeds = {
        self.model.inputs: [np.asarray(mel, dtype=np.float32)],
        self.model.input_lengths: np.asarray([mel.shape[0]], dtype=np.int32),
    }
    wav = audio.inv_preemphasis(
        self.session.run(self.wav_output, feed_dict=feeds))
    wav = wav[:audio.find_endpoint(wav)]
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    return buf.getvalue()
def synthesize(self, text):  # for demo_server
    """Synthesize `text` for the demo server; returns a `bytes` WAV payload."""
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    encoded = text_to_sequence(text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(encoded, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(encoded)], dtype=np.int32),
    }
    wav = self.session.run(self.wav_output, feed_dict=feeds)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]  # trim past end of speech
    payload = io.BytesIO()
    audio.save_wav(wav, payload)
    return payload.getvalue()  # returns bytes obj
def synthesize(self, text):
    """Synthesize `text` (converted to ARPAbet first) and return the WAV bytes.

    Bug fixed: the in-memory WAV write had been commented out in favor of a
    hard-coded Google-Drive path and a debug print, so this method always
    returned empty bytes to its caller.
    """
    text = arpa.to_arpa(text)
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    # Write into the buffer so the returned bytes actually contain the audio.
    audio.save_wav(wav, out)
    return out.getvalue()
def main(args):
    """Entry point: train the model, predict WAVs, or export a SavedModel,
    depending on `args.mode` ('train' | 'predict' | 'export').
    """
    os.makedirs(args.model_dir, exist_ok=True)
    run_config = RunConfig(
        save_summary_steps=args.summary_interval,
        save_checkpoints_steps=args.checkpoint_interval,
        session_config=SESS_CFG,
        keep_checkpoint_max=2)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=run_config)

    mode = args.mode
    if mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)
        estimator.train(input_fn=lambda: train_input_fn(args.data_dir))
    elif mode == 'predict':
        assert len(args.texts), "No text to predict"
        predictions = estimator.predict(
            input_fn=lambda: predict_input_fn(args.texts))
        for i, wav in enumerate(predictions):
            # De-emphasize and write one file per predicted utterance.
            save_wav(inv_preemphasis(wav), 'output_{}.wav'.format(i))
    elif mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            {
                'inputs': tf.placeholder(
                    dtype=tf.int32, shape=(None, None), name='inputs'),
                'lengths': tf.placeholder(
                    dtype=tf.int32, shape=(None, ), name='lengths'),
            },
            default_batch_size=None)
        estimator.export_saved_model(args.export_dir, receiver_fn)
    else:
        raise KeyError('Unknown Mode <{}>'.format(mode))
def synthesize(self, text, base_path, idx):
    """Synthesize `text`, save an alignment plot, and return the WAV bytes.

    The alignment image is written to '<base_path>-<idx>-align.png',
    annotated with the decoded input sequence.
    """
    seq = text_to_sequence(text)
    feeds = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    input_seq, wav, alignment = self.session.run(
        [self.inputs, self.wav_output, self.alignments], feed_dict=feeds)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    readable_input = sequence_to_text(input_seq)
    plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx),
                        readable_input)
    return buf.getvalue()
def synthesize(self, path_in, path_re, mel_targets=None, reference_mel=None, alignment_path=None):
    """Synthesize audio from the recording at `path_in`, conditioned on the
    reference recording at `path_re`; save the result under static/out/ and
    return (relative_output_path, output_file_name).
    """
    wav_in = audio.load_wav(path_in)
    wav_re = audio.load_wav(path_re)
    mel_in = audio.melspectrogram(wav_in).astype(np.float32)
    mel_re = audio.melspectrogram(wav_re).astype(np.float32)
    feed_dict = {
        # Inputs are fed time-major (mel_in.T), so input_lengths must count
        # frames — mel_in.shape[1] — not mel channels. The previous
        # len(mel_in) counted the mel-bin axis. (Assumes melspectrogram
        # returns (mels, frames), consistent with the .T here — confirm.)
        self.model.inputs: [mel_in.T],
        self.model.input_lengths: np.asarray([mel_in.shape[1]], dtype=np.int32),
        self.model.inputs_jp: [mel_re.T],
    }
    wav_out, alignments = self.session.run(
        [self.wav_output, self.alignments], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav_out)
    wav = wav[:audio.find_endpoint(wav)]

    # Unique file name: timestamp plus a zero-padded 2-digit random suffix.
    # The original's `if n <= 10: n = '0' + str(n)` produced 3-digit '010'
    # and randint(0, 100) could also yield 3 digits; both fixed here.
    now_stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    suffix = '{:02d}'.format(random.randint(0, 99))
    out_name = now_stamp + suffix + ".wav"
    # Portable separators instead of hard-coded Windows backslashes.
    out_dir = os.path.join("static", "out", out_name)
    audio.save_wav(wav, out_dir)
    return out_dir, out_name
def synthesize(self, text):
    """Synthesize Chinese `text` (via text_to_sequence_zh) and return WAV bytes."""
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    seq = text_to_sequence_zh(text, cleaners)
    print(seq)  # debug: encoded symbol ids
    feeds = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    wav = self.session.run(self.wav_output, feed_dict=feeds)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    return buf.getvalue()
def synthesize(self, text, title):
    """Synthesize `text`, save it as narration/saved_audio/<title>.wav under
    the current working directory, play it with aplay, and return the WAV bytes.
    """
    import subprocess  # local: only this method shells out for playback

    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # Portable path construction instead of manual '/' concatenation.
    audio_dir = os.path.join(os.getcwd(), "narration", "saved_audio", title + ".wav")
    print(audio_dir)
    with open(audio_dir, "wb") as f:
        f.write(out.getvalue())
    # Argument-list invocation is safe for titles containing spaces or shell
    # metacharacters, unlike the previous os.system("aplay " + path).
    subprocess.run(["aplay", audio_dir], check=False)
    return out.getvalue()
def synthesize(self, text):
    """Synthesize from a two-part `'a|b'` input and return the WAV bytes.

    `text` carries two '|'-separated encodings of the utterance; each is
    converted to an id sequence and fed to its own model input (c_/p_).
    """
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    parts = text.split('|')
    c_seq = text_to_sequence(parts[0], cleaners)
    p_seq = text_to_sequence(parts[1], cleaners)
    feeds = {
        self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
        self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
        self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
        self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32),
    }
    wav = audio.inv_preemphasis(
        self.session.run(self.wav_output, feed_dict=feeds))
    wav = wav[:audio.find_endpoint(wav)]
    buf = io.BytesIO()
    audio.save_wav(wav, buf)
    return buf.getvalue()
def synthesize(self, text):
    """Synthesize `text` unless another synthesis is already in flight.

    Returns the WAV bytes, or None when the class-level `processing` flag
    indicates another request is being handled.
    """
    with Synthesizer.mutex:
        if not Synthesizer.processing:
            Synthesizer.processing = True
            try:
                cleaner_names = [
                    x.strip() for x in hparams.cleaners.split(',')
                ]
                seq = text_to_sequence(text, cleaner_names)
                feed_dict = {
                    self.model.inputs: [np.asarray(seq, dtype=np.int32)],
                    self.model.input_lengths:
                    np.asarray([len(seq)], dtype=np.int32)
                }
                wav = self.session.run(self.wav_output, feed_dict=feed_dict)
                wav = audio.inv_preemphasis(wav)
                wav = wav[:audio.find_endpoint(wav)]
                out = io.BytesIO()
                audio.save_wav(wav, out)
                return out.getvalue()
            finally:
                # Always clear the flag. Previously an exception during
                # synthesis left `processing` stuck at True, permanently
                # rejecting all future requests.
                Synthesizer.processing = False
        else:
            return None
def synthesize(self, images_dir, output_wav_dir):
    """Generate a WAV for every .png found (recursively) under `images_dir`.

    Each image is resized to 224x224x3, fed to the model, and the resulting
    waveform is saved as 'eval-<image name>.wav' inside `output_wav_dir`.
    """
    for path, _, filenames in os.walk(images_dir):
        for i in trange(len(filenames)):  # trange: progress bar per directory
            image_file = filenames[i]
            if not image_file.endswith('.png'):
                continue  # skip non-image files
            stem, _ = os.path.splitext(image_file)
            rgb = imread(os.path.join(path, image_file), mode='RGB')
            resized = imresize(rgb, (224, 224, 3))
            feeds = {
                self.model.inputs:
                [np.asarray(resized, dtype=np.float32)],
            }
            wav = audio.inv_preemphasis(
                self.session.run(self.wav_output, feed_dict=feeds))
            wav = wav[:audio.find_endpoint(wav)]
            wav_path = os.path.join(output_wav_dir,
                                    'eval-{}.wav'.format(stem))
            audio.save_wav(wav, wav_path)
            print('Wav - {} generated successfully!'.format(wav_path))
def synthesize(self, lab_name):
    """Synthesize from a precomputed label array stored at `lab_name` (.npy).

    Returns:
        (wav_bytes, mel_outputs): the endpoint-trimmed WAV as bytes and the
        model's mel prediction for the single batch item.
    """
    lab = np.load(lab_name)
    lab = np.expand_dims(lab, axis=0)  # add batch dimension
    feed_dict = {
        self.model.inputs: lab,
        self.model.input_lengths: np.asarray([lab.shape[1]], dtype=np.int32),
        # change 2 to another id to select a different speaker
        self.model.speaker_ids: np.asarray([2], dtype=np.int32)
    }
    wav, mel_outputs = self.session.run(
        [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    # Trim once past the detected end of speech. The original trimmed twice
    # (a no-op) and then crashed on `mel_output[:frames, :]` — both
    # `mel_output` and `frames` were undefined; that dead line is removed.
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue(), mel_outputs
def synthesize(self, text, identity, path=None, path_align=None):
    """Synthesize `text` for speaker `identity`, save the WAV (to `path`, or
    './1.wav' when no path is given) and optionally an alignment plot, and
    return the WAV bytes.

    Bug fixed: the BytesIO buffer was returned without ever being written,
    so callers always received empty bytes.
    """
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    # [:-1] drops the final symbol (presumably an EOS marker) — confirm.
    seq = text_to_sequence2(text, cleaner_names)[:-1]
    print(seq)
    print(sequence_to_text2(seq))
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        self.model.identities: np.asarray([identity], dtype=np.int32),
    }
    wav, alignment = self.session.run([self.wav_output, self.alignment],
                                      feed_dict=feed_dict)
    if path_align is not None:
        plot.plot_alignment(alignment, path_align)
    wav = audio.inv_preemphasis(wav)
    out = io.BytesIO()
    audio.save_wav(wav, out)  # fill the returned buffer
    if path is not None:
        audio.save_wav(wav, path)
    else:
        audio.save_wav(wav, './1.wav')
    return out.getvalue()
def synthesize(self, text, mel_spec):
    """Synthesize `text`, optionally conditioned on `mel_spec` when the
    fv1/fv2 hparams are enabled, and return (waveform, alignment).
    """
    cleaners = [c.strip() for c in self.hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaners)
    feeds = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
    }
    if self.hparams.enable_fv1 or self.hparams.enable_fv2:
        feeds[self.net.data2] = mel_spec  # extra conditioning input
    wav, alignment = self.session.run(
        [self.wav_output, self.model.alignments], feed_dict=feeds)
    wav = audio.inv_preemphasis(wav)
    return wav, alignment[0]