def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100): text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).long().to(device) pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) start = int(round(time.time() * 1000)) model.eval() with torch.no_grad(): mel, mel_postnet = model(text, pos, alpha=alpha) end = int(round(time.time() * 1000)) tt = end - start print("Total - making mel : %d ms\n" % tt) mel = mel[0].cpu().numpy().T mel_postnet = mel_postnet[0].cpu().numpy().T #plot_data([mel, mel_postnet]) wav = audio.inv_mel_spectrogram(mel_postnet) print("Wav Have Been Synthesized.\n") if not os.path.exists("results"): os.mkdir("results") new_name = text_seq.replace(" ", "_") audio.save_wav( wav, os.path.join("results", new_name + str(num) + mode + ".wav")) return new_name
def synthesize(self, text, speaker_id=0):
    """Convert text to a speech waveform given a DeepVoice3 model.

    Args:
        text (str): Input text to be synthesized.
        speaker_id (int): Speaker id for multi-speaker models. Default is 0.
    """
    sequence = np.array(self._frontend.text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    speaker_ids = None if speaker_id is None else Variable(
        torch.LongTensor([speaker_id]))
    if self.use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = self.model(
        sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)
    out = io.BytesIO()
    audio.save_wav(waveform, out)
    return out
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker): # harded coded texts = [ "Scientists at the CERN laboratory say they have discovered a new particle.", "There's a way to measure the acute emotional intelligence that has never gone out of style.", "President Trump met with other leaders at the Group of 20 conference.", "Generative adversarial network or variational auto-encoder.", "Please call Stella.", "Some have accepted this as a miracle without any physical explanation.", ] import synthesis synthesis._frontend = _frontend eval_output_dir = join(checkpoint_dir, "eval") os.makedirs(eval_output_dir, exist_ok=True) # hard coded speaker_ids = [0, 1, 10] if ismultispeaker else [None] for speaker_id in speaker_ids: speaker_str = "multispeaker{}".format( speaker_id) if speaker_id is not None else "single" for idx, text in enumerate(texts): signal, alignment, _, mel = synthesis.tts(model, text, p=0, speaker_id=speaker_id, fast=False) signal /= np.max(np.abs(signal)) # Alignment path = join( eval_output_dir, "step{:09d}_text{}_{}_alignment.png".format( global_step, idx, speaker_str)) save_alignment(path, alignment) tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str) writer.add_image( tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step) # Mel writer.add_image( "(Eval) Predicted mel spectrogram text{}_{}".format( idx, speaker_str), prepare_spec_image(mel), global_step) # Audio path = join( eval_output_dir, "step{:09d}_text{}_{}_predicted.wav".format( global_step, idx, speaker_str)) audio.save_wav(signal, path) try: writer.add_audio("(Eval) Predicted audio signal {}_{}".format( idx, speaker_str), signal, global_step, sample_rate=fs) except Exception as e: warn(str(e)) pass
def get_tacotron2_alignment_test(text_seq):
    hparams = hp_tacotron2.create_hparams()
    hparams.sampling_rate = hp.sample_rate

    checkpoint_path = os.path.join(
        "Tacotron2", os.path.join("pre_trained_model", "tacotron2_statedict.pt"))
    tacotron2 = train_tacotron2.load_model(hparams)
    tacotron2.load_state_dict(torch.load(checkpoint_path)["state_dict"])
    _ = tacotron2.cuda().eval().half()

    sequence = np.array(text_to_sequence(text_seq, hp.text_cleaners))[None, :]
    print("sequence size", np.shape(sequence))
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel, mel_postnet, _, alignment = tacotron2.inference(sequence)
    plot_data((mel.float().data.cpu().numpy()[0],
               mel_postnet.float().data.cpu().numpy()[0],
               alignment.float().data.cpu().numpy()[0].T))

    wav = audio.inv_mel_spectrogram(mel_postnet.float().data.cpu().numpy()[0])
    audio.save_wav(wav, "test.wav")

    alignment = alignment.float().data.cpu().numpy()[0]
    print("alignment size", np.shape(alignment))
    get_D(alignment)

    return alignment
def synth(self, text, save=None):
    inp = clean(text)
    print(inp)
    x = [self.c2i[c] for c in inp + 'E']
    x += [0] * (hp.maxlen - len(x))
    x = np.array(x)
    x = x.reshape(1, -1)

    with self.melsession.as_default():
        preds = np.zeros((1, 1, hp.n_mels), np.float32)
        cnt = hp.Tyr
        for j in range(hp.Tyr):
            sys.stdout.write('\rProcessing %d' % j)
            sys.stdout.flush()
            _preds, a = self.melsession.run(
                [self.melmodel.mel_output, self.melmodel.A],
                {self.melmodel.text: x, self.melmodel.mel: preds})
            preds = np.concatenate((np.zeros((1, 1, hp.n_mels)), _preds),
                                   axis=1)
            cnt -= 1
            if np.argmax(a[0, :, -1]) >= len(inp) - 3:
                cnt = min(cnt, 10)
            if cnt <= 0:
                break

    with self.magsession.as_default():
        wav = self.magsession.run(self.magmodel.wav_output,
                                  {self.magmodel.mel: preds})
        wav = audio.inv_preemphasis(wav)
        if save is not None:
            audio.save_wav(wav[0], save)
        else:
            out = io.BytesIO()
            audio.save_wav(wav[0], out)
            return out.getvalue()
def make_prediction(self, x, y, epoch):
    _, target_mel, target_linear, _ = y
    _, mel, linear, _, alignment = self.tacotron._predict_with_target(*x)
    mel = mel[0]
    linear = linear[0]
    alignment = alignment[0]

    step = (epoch + 1) * self.steps_per_epoch
    mel_filename = os.path.join(self.mel_dir, "step-{}.npy".format(step))
    linear_filename = os.path.join(self.linear_dir, "step-{}.npy".format(step))
    plot_mel_filename = os.path.join(self.plot_dir,
                                     "mel_step-{}.png".format(step))
    plot_linear_filename = os.path.join(self.plot_dir,
                                        "linear_step-{}.png".format(step))
    plot_align_filename = os.path.join(self.plot_dir,
                                       "align_step-{}.png".format(step))
    wav_mel_filename = os.path.join(self.wav_dir,
                                    "wav_from_mel_step-{}.wav".format(step))
    wav_linear_filename = os.path.join(
        self.wav_dir, "wav_from_linear_step-{}.wav".format(step))

    wav_from_linear = self.tacotron.wav_from_linear(linear)
    wav_from_mel = self.tacotron.wav_from_mel(mel)

    np.save(mel_filename, mel)
    np.save(linear_filename, linear)
    audio.save_wav(wav_from_mel, wav_mel_filename, self.tacotron.audio_rate)
    audio.save_wav(wav_from_linear, wav_linear_filename,
                   self.tacotron.audio_rate)

    plot_alignment(alignment, title="Alignments",
                   filename=plot_align_filename, show=False, fontsize=14)
    plot_spectrogram(mel, title="Mel spectrogram",
                     filename=plot_mel_filename, show=False,
                     target_spectrogram=target_mel[0])
    plot_spectrogram(linear, title="Linear spectrogram",
                     filename=plot_linear_filename, show=False,
                     target_spectrogram=target_linear[0])
def synthesis_griffin_lim(text_seq, model):
    text = text_to_sequence(text_seq, hp.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)
    dec_pos = torch.stack(
        [torch.Tensor([i + 1 for i in range(int(5.8 * text.size(1)))])])
    dec_pos = dec_pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, dec_pos)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Waveform has been synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav, os.path.join("results", text_seq + ".wav"))
def synthesis(text, num):
    m = Model()
    # m_post = ModelPostNet()
    m.load_state_dict(load_checkpoint(num, "transformer"))
    # m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    # m_post = m_post.cuda()
    m.train(False)
    # m_post.train(False)

    # pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for _ in range(1000):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

    # mag_pred = m_post.forward(postnet_pred)
    # wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
def save_states(self, global_epoch, mel_outputs, linear_outputs, ling, mel,
                linear, lengths):
    print("Save intermediate states at epoch {}".format(global_epoch))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(lengths) - 1)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        self.writer.add_image("Predicted mel spectrogram", mel_output,
                              global_epoch)

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        self.writer.add_image("Predicted spectrogram", spectrogram,
                              global_epoch)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(self.checkpoint_dir,
                    "epoch{:09d}_predicted.wav".format(global_epoch))
        try:
            self.writer.add_audio("Predicted audio signal", signal,
                                  global_epoch, sample_rate=self.fs)
        except Exception as e:
            warn(str(e))
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        # ling = ling[idx].cpu().data.numpy()
        # mel = prepare_spec_image(audio._denormalize(mel))
        # self.writer.add_image("Source mel spectrogram", ling, global_epoch)
        mel = mel[idx].cpu().data.numpy()
        mel = prepare_spec_image(audio._denormalize(mel))
        self.writer.add_image("Target mel spectrogram", mel, global_epoch)

    if linear_outputs is not None:
        linear = linear[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear))
        self.writer.add_image("Target spectrogram", spectrogram, global_epoch)

        # Target audio signal
        signal = audio.inv_spectrogram(linear.T)
        signal /= np.max(np.abs(signal))
        try:
            self.writer.add_audio("Target audio signal", signal, global_epoch,
                                  sample_rate=self.fs)
        except Exception as e:
            warn(str(e))
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")

    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
def predict(self, liste_phrases, out_dir, min_iter=5, max_iter=100000):
    mel_dir = os.path.join(out_dir, 'mels')
    linear_dir = os.path.join(out_dir, 'linear')
    plot_dir = os.path.join(out_dir, 'plots')
    wav_dir = os.path.join(out_dir, 'wavs')
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(linear_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    outputs = []
    for i, phrase in tqdm(enumerate(liste_phrases)):
        mel, linear, alignment = self._predict(phrase, min_iter=min_iter,
                                               max_iter=max_iter)

        mel_filename = os.path.join(mel_dir, "pred_{}.npy".format(i))
        linear_filename = os.path.join(linear_dir, "pred_{}.npy".format(i))
        plot_mel_filename = os.path.join(plot_dir,
                                         "mel_spectrogram_{}.png".format(i))
        plot_linear_filename = os.path.join(
            plot_dir, "linear_spectrogram_{}.png".format(i))
        plot_align_filename = os.path.join(plot_dir,
                                           "alignments_{}.png".format(i))
        wav_mel_filename = os.path.join(wav_dir,
                                        "wav_from_mel_{}.wav".format(i))
        wav_linear_filename = os.path.join(
            wav_dir, "wav_from_linear_{}.wav".format(i))

        wav_from_linear = self.wav_from_linear(linear)
        wav_from_mel = self.wav_from_mel(mel)

        np.save(mel_filename, mel)
        np.save(linear_filename, linear)
        audio.save_wav(wav_from_mel, wav_mel_filename, self.audio_rate)
        audio.save_wav(wav_from_linear, wav_linear_filename, self.audio_rate)

        plot_alignment(alignment,
                       title="Alignments for :\n{}".format(phrase),
                       filename=plot_align_filename, show=False, fontsize=16)
        plot_spectrogram(mel, title="Mel spectrogram",
                         filename=plot_mel_filename, show=False)
        plot_spectrogram(linear, title="Linear spectrogram",
                         filename=plot_linear_filename, show=False)

        outputs.append((mel, linear, alignment))
    return outputs
def tts(model, text, file_path, p=0, speaker_id=None, fast=True):
    from synthesis import tts as _tts
    import audio

    waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id,
                                                 fast)
    # 22050, 353 kbps, 16 bit, mono
    audio.save_wav(waveform, file_path)
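# A minimal usage sketch for the tts() wrapper above. It assumes the
# surrounding module has already built and restored `model` (that code is not
# shown here); the helper name, sentence list, and output directory are
# illustrative only, not part of the original code.
def tts_batch(model, sentences, out_dir="tts_out"):
    """Synthesize each sentence to its own wav file via tts()."""
    os.makedirs(out_dir, exist_ok=True)
    for i, sentence in enumerate(sentences):
        out_path = os.path.join(out_dir, "sentence_{}.wav".format(i))
        tts(model, sentence, out_path, p=0, speaker_id=None, fast=True)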
def synthesis_waveglow(mel, waveglow, num, alpha=1.0):
    wav = waveglow.infer(mel, sigma=0.666)
    print("Waveform has been synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav[0].data.cpu().numpy(),
                   os.path.join("results", str(num) + ".wav"))
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker): # harded coded texts = [ "This is Informatics Institute of Technology evaluation sentence for Text to speeh for sinhala" ] import synthesis synthesis._frontend = _frontend eval_output_dir = join(checkpoint_dir, "eval") os.makedirs(eval_output_dir, exist_ok=True) # hard coded speaker_ids = [0, 1, 10] if ismultispeaker else [None] for speaker_id in speaker_ids: speaker_str = "multispeaker{}".format( speaker_id) if speaker_id is not None else "single" for idx, text in enumerate(texts): signal, alignment, _, mel = synthesis.tts(model, text, p=0, speaker_id=speaker_id, fast=False) signal /= np.max(np.abs(signal)) # Alignment path = join( eval_output_dir, "step{:09d}_text{}_{}_alignment.png".format( global_step, idx, speaker_str)) save_alignment(path, alignment) tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str) writer.add_image( tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step) # Mel writer.add_image( "(Eval) Predicted mel spectrogram text{}_{}".format( idx, speaker_str), prepare_spec_image(mel), global_step) # Audio path = join( eval_output_dir, "step{:09d}_text{}_{}_predicted.wav".format( global_step, idx, speaker_str)) audio.save_wav(signal, path) try: writer.add_audio("(Eval) Predicted audio signal {}_{}".format( idx, speaker_str), signal, global_step, sample_rate=fs) except Exception as e: warn(str(e)) pass
def eval_model(device, model, global_step, logs_dir, ismultispeaker):
    """Evaluate the model."""
    import synthesis

    # Hardcoded sentences for evaluation
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of Twenty conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]

    eval_output_dir = join(logs_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)
    eval_alignment_dir = join(eval_output_dir, "alignment")
    os.makedirs(eval_alignment_dir, exist_ok=True)
    eval_wav_dir = join(eval_output_dir, "wavs")
    os.makedirs(eval_wav_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # hard coded
    speaker_ids = [0, 1, cfg.n_speakers - 1] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = ("multispeaker{}".format(speaker_id)
                       if speaker_id is not None else "single")

        for idx, text in enumerate(texts):
            signal, alignment, _, _ = synthesis.tts(model_eval, text,
                                                    speaker_id=speaker_id,
                                                    fast=True)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(
                eval_alignment_dir,
                f"step{global_step:09d}_text{idx}_{speaker_str}_alignment.png")
            save_alignment(path, alignment)

            # Audio
            path = join(
                eval_wav_dir,
                f"step{global_step:09d}_text{idx}_{speaker_str}_predicted.wav")
            audio.save_wav(signal, path)
def save_states(global_step, mel_outputs, linear_outputs, attn, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        "step{:09d}_layer_{}_alignment.png".format(
                            global_step, i + 1))
            alignment = alignment[idx].cpu().data.numpy()
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
    else:
        assert False

    # Predicted spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = join(checkpoint_dir,
                "step{:09d}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)
def inference(args):
    hparams = create_hparams()
    sentences = get_sentences(args)
    # sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size]
    #              for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint)['state_dict'])
    model.cuda().eval()  # .half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = (DistributedSampler(test_set)
                    if hparams.distributed_run else None)
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             sampler=test_sampler,
                             batch_size=hparams.synth_batch_size,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=test_collate_fn)

    # taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
    #                          hparams.win_length, sampling_rate=hparams.sampling_rate)

    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels
                       else (0, hparams.max_abs_value))

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            print("CHECK batch", i, batch)
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)
            print('synthesize!!!', mel_outputs)
            print('synthesize!!!', mel_outputs_postnet)

            mels = mel_outputs_postnet[0].cpu().numpy()
            print('CHECK MEL SHAPE:', mels.shape)

            mel_path = os.path.join(args.out_filename,
                                    'sentence_{}_mel.npy'.format(i))
            # mels = [mel for gpu_mels in mels for mel in mel_outputs]
            mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
            np.save(mel_path, mels, allow_pickle=False)
            print('CHECK MEL SHAPE:', mels.shape)

            audio_path = os.path.join(args.out_filename,
                                      'sentence_{}.wav'.format(i))
            wav = audio.inv_mel_spectrogram(mels, hparams)
            audio.save_wav(wav, audio_path, sr=hparams.sampling_rate)
def text_to_speech(text, speaker_id=-1):
    kwargs = {}
    if speaker_id >= 0:
        kwargs["speaker_id"] = speaker_id
    waveform, alignment, spectrogram, mel = tts(model, text, fast=False,
                                                **kwargs)
    with tempfile.SpooledTemporaryFile() as f:
        audio.save_wav(waveform, f)
        f.seek(0)
        return f.read()
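# A minimal usage sketch for text_to_speech() above: since it returns the raw
# wav bytes, a caller can write them straight to disk or stream them over
# HTTP. The helper name and output path are illustrative assumptions, not
# part of the original code.
def text_to_speech_to_file(text, path, speaker_id=-1):
    wav_bytes = text_to_speech(text, speaker_id=speaker_id)
    with open(path, "wb") as f:
        f.write(wav_bytes)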
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        # Librosa trim seems to cut off the ending part of speech
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100, check=True): text_seq = text_seq[:-1] text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).long().to(device) sequence = np.array(text_to_sequence(text_seq, hp.hparams.text_cleaners))[None, 1] pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) model.eval() with torch.no_grad(): mel, mel_postnet = model(text, pos, alpha=alpha) if not os.path.exists("results_kor_0730_nam_95000"): os.mkdir("results_kor_0730_nam_95000") new_name = text_seq.replace(" ", "_") new_name = new_name.replace("?", "_") new_name = new_name[:-1] new_name2 = new_name + str(num) + mode + ".wav" new_name3 = "results_kor_0730_nam_95000/" + new_name2 mel = mel[0].cpu().numpy().T mel_postnet = mel_postnet[0].cpu().numpy().T plot_data([mel, mel_postnet], file_name=new_name) start = int(round(time.time() * 1000)) wav = audio.inv_mel_spectrogram(mel_postnet) end = int(round(time.time() * 1000)) audio.save_wav(wav, os.path.join("results_kor_0730_nam_95000", new_name2)) clean_text = new_name.replace("_", " ") if check: x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)]) else: x = 0 y = 0 print("Total time : ", end - start) print() return new_name, x, y
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized
    wav to disk at out_path.
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)
    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
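# A minimal sketch showing how copy_synthesis() above could be run over a
# whole directory of wav files (an analysis/inversion round-trip for every
# file). The helper name and directory layout are assumptions for
# illustration; glob and os are standard library.
import glob

def copy_synthesis_dir(wav_dir, out_path):
    os.makedirs(out_path, exist_ok=True)
    for wav_file in sorted(glob.glob(os.path.join(wav_dir, "*.wav"))):
        copy_synthesis(wav_file, out_path)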
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "Generative adversarial network or variational auto-encoder.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    speaker_id = 0 if ismultispeaker else None
    for idx, text in enumerate(texts):
        signal, alignment, _, mel = synthesis.tts(model, text, p=0,
                                                  speaker_id=speaker_id,
                                                  fast=False)
        signal /= np.max(np.abs(signal))

        # Alignment
        path = join(eval_output_dir,
                    "step{:09d}_text{}_alignment.png".format(global_step, idx))
        save_alignment(path, alignment)
        tag = "eval_averaged_alignment_{}".format(idx)
        writer.add_image(tag,
                         np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step)

        # Mel
        writer.add_image("(Eval) Predicted mel spectrogram text{}".format(idx),
                         prepare_spec_image(mel), global_step)

        # Audio
        path = join(eval_output_dir,
                    "step{:09d}_text{}_predicted.wav".format(global_step, idx))
        audio.save_wav(signal, path)

        try:
            writer.add_audio("(Eval) Predicted audio signal {}".format(idx),
                             signal, global_step, sample_rate=fs)
        except Exception as e:
            warn(str(e))
def eval_model(global_step, device, model, checkpoint_dir, ismultispeaker):
    # Hard-coded evaluation sentences
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # Hard-coded speaker ids
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = ("multispeaker{}".format(speaker_id)
                       if speaker_id is not None else "single")

        for idx, text in enumerate(texts):
            signal, alignment, _, mel = synthesis.tts(
                model_eval, text, p=0, speaker_id=speaker_id, fast=True)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(eval_output_dir,
                        "step{:09d}_text{}_{}_alignment.png".format(
                            global_step, idx, speaker_str))
            save_alignment(path, alignment)
            tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)

            # Audio
            path = join(eval_output_dir,
                        "step{:09d}_text{}_{}_predicted.wav".format(
                            global_step, idx, speaker_str))
            audio.save_wav(signal, path)
def save_states(global_step, attn, linear_outputs, input_lengths, logs_dir):
    """Save intermediate states."""
    print(f"Save intermediate states at step {global_step:09d}")

    idx = min(1, len(input_lengths) - 1)

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            # Save alignment to disk
            alignment = alignment[idx].cpu().data.numpy()
            alignment_dir = join(logs_dir, f"alignment_layer{i + 1}")
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir,
                        f"step{global_step:09d}_layer_{i + 1}_alignment.png")
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(logs_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    f"step{global_step:09d}_layer_alignment.png")
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

    linear_output = linear_outputs[idx].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    signal /= np.max(np.abs(signal))
    wavs_dir = join(logs_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)
    path = join(wavs_dir, f"step{global_step:09d}_predicted.wav")
    audio.save_wav(signal, path)
def main(args):
    model = CNNVocoder(n_heads=hparams.n_heads,
                       layer_channels=hparams.layer_channels,
                       pre_conv_channels=hparams.pre_conv_channels,
                       pre_residuals=hparams.pre_residuals,
                       up_residuals=hparams.up_residuals,
                       post_residuals=hparams.post_residuals)
    model = model.cuda()
    model, _, _, _ = load_checkpoint(args.model_path, model)

    spec = np.load(args.spec_path)
    spec = torch.FloatTensor(spec).unsqueeze(0).cuda()

    t1 = time()
    _, wav = model(spec)
    dt = time() - t1
    print('Synthesized audio in {}s'.format(dt))

    wav = wav.data.cpu()[0].numpy()
    audio.save_wav(wav, args.out_path)
def main(): rate, data = aud.get_wav("violin_4k.wav") print "Original:", data.size new_data = np.copy(data) # select 5 seconds of audio train_data = aud.cut_wav(new_data, 10, 15) print "Train:", train_data.size aud.save_wav(train_data, "violin_train.wav") seed_data = aud.cut_wav(new_data, 16, 17) X, Y = get_data_labels(train_data) seed_X, seed_Y = get_data_labels(seed_data) generated = generate_audio(X, Y, np.array([seed_X[0]])) aud.save_wav(generated, "violin_gen.wav")
def generate(model_path, model_name, generate_path, generate_name, piece):
    """Resynthesize audio by passing its spectrogram through a pretrained
    autoencoder.

    Args:
        model_path: Directory containing the pretrained model checkpoint.
        model_name: Name of the checkpoint to load.
        generate_path: Output directory for the generated audio.
        generate_name: Base name of the generated wav file.
        piece: Path to the input audio file to encode and resynthesize.
    """
    # Create the output directory for the generated audio
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()
    net.eval()

    # Load audio and compute its spectrogram
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()
    if cuda_available:
        spec = spec.cuda()

    generated_spec = net(spec)
    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)

    waveform = audio.inv_spectrogram(generated_spec.T)
    wav_name = generate_path + generate_name + '.wav'
    audio.save_wav(waveform, wav_name)
def synthesis_waveglow(text_seq, model, waveglow, alpha=1.0, mode=""): text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).long().to(device) pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) model.eval() with torch.no_grad(): _, mel_postnet = model(text, pos, alpha=alpha) with torch.no_grad(): wav = waveglow.infer(mel_postnet, sigma=0.666) print("Wav Have Been Synthesized.") if not os.path.exists("results"): os.mkdir("results") audio.save_wav(wav[0].data.cpu().numpy(), os.path.join("results", text_seq + mode + ".wav"))
def synthesize(mel_sp, save_path):
    assert len(mel_sp.shape) == 2
    mel_sp = np.expand_dims(mel_sp, axis=0)
    assert mel_sp.shape[1] == hparams.num_mels

    max_time_frame = mel_sp.shape[2]
    audio_len = max_time_frame * hparams.hop_size
    batch = {"c": mel_sp}

    wavenet = tf.estimator.Estimator(
        model_fn=wavenet_fn,
        model_dir=hparams.model_directory,
        params={
            'feature_columns': tf.feature_column.numeric_column(
                key="c",
                shape=[hparams.num_mels, max_time_frame],
                dtype=tf.float32),
            'hparams': hparams,
            'time_len': audio_len
        })

    input_fn = tf.estimator.inputs.numpy_input_fn(x=batch,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_epochs=1)
    wavenet_checkpoint = wavenet.latest_checkpoint()
    wavenet_outputs = wavenet.predict(input_fn=input_fn,
                                      checkpoint_path=wavenet_checkpoint)

    for result in wavenet_outputs:
        outputs = result['outputs']
        if hparams.input_type == "mulaw-quantize":
            outputs = inv_mulaw_quantize(outputs)
        save_wav(outputs, save_path, hparams.sample_rate)
def main(): # Target data filename = "120_kmeans_obj.pkl" kmeans = k.load_pkl(filename) spec, label = load_test_data() print("spec", spec.shape) print("label", label.shape) spec_ = np.empty((513, ), np.float32) for i in range(len(label)): spec_ = np.vstack((spec_, kmeans.cluster_centers_[label[i]])) spec_ = np.delete(spec_, 0, 0) print("compare data structure ----") print("spec: ", spec.shape) print("spec_: ", spec_.shape) print("spec data:", spec) print("spec_ data:", spec_) print("min-max spce_ data:", min_max(spec_)) waveform = audio.inv_spectrogram(spec) waveform_ = audio.inv_spectrogram(spec_) waveformmm_ = audio.inv_spectrogram(min_max(spec_)) audio.save_wav(waveform, 'ideal_out.wav') audio.save_wav(waveform_, 'idela_out_.wav') audio.save_wav(waveformmm_, 'idelal_outmm_.wav')