def style_transfer_v2():
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)

    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))
    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # Get audio path, encoded text, pitch contour and mel (GST reference) for this utterance.
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # Load source data to obtain rhythm, using Tacotron 2 as a forced aligner.
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style transfer (rhythm and pitch contour).
    with torch.no_grad():
        # Get rhythm (alignment map) using Tacotron 2.
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
        rhythm = rhythm.permute(1, 0, 2)

    speaker_id = next(female_speakers) if np.random.randint(2) else next(male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

    # Vocode with MelGAN.
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

    # Vocode with WaveGlow + denoiser.
    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
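# `load_mel` is called above but not defined in this excerpt. A minimal sketch,
# following the shape of the Mellotron inference notebook, is given below; the
# module-level `stft` object (e.g. a layers.TacotronSTFT built from `hparams`) is
# an assumption here and may be named or configured differently in this repo.
def load_mel(path):
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)  # shape: (1, n_mel_channels, T)
    return melspec.cuda()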
def run_compare():
    args = parse_args()
    load_vocoder_melgan(args.load_path)
    for i, fname in tqdm(enumerate(args.folder.glob("*.wav"))):
        wav, sr = librosa.core.load(fname, sr=16000)
        mel = wav2mel(wav)
        out = infer_waveform_melgan(mel=mel)
        aukit.play_audio(wav, sr=sr)
        aukit.play_audio(out, sr=sr)
def run_tuner():
    import aukit
    from aukit.audio_tuner import tune_speed, tune_pitch

    inpath = r"hello.wav"
    aukit.anything2bytes(inpath)
    aukit.anything2wav(inpath)
    aukit.anything2bytesio(inpath)

    bys = tune_speed(inpath, sr=16000, rate=0.5, out_type=None)
    print(bys)
    wav = tune_pitch(bys, sr=16000, rate=1, out_type=None)
    print(wav)
    aukit.play_audio(wav)
def singing_voice_v2():
    # Singing voice from a music score.
    data = get_data_from_musicxml('data/sinsy/csongdb_f00002_000_en.musicxml', 132,
                                  convert_stress=True)
    panning = {'Soprano': [-60, -30], 'Alto': [-40, -10],
               'Tenor': [30, 60], 'Bass': [10, 40]}
    n_speakers_per_part = 4
    frequency_scaling = 0.4
    n_seconds = 90
    audio_stereo = np.zeros((hparams.sampling_rate * n_seconds, 2), dtype=np.float32)

    for i, (part, v) in enumerate(data.items()):
        rhythm = data[part]['rhythm'].cuda()
        pitch_contour = data[part]['pitch_contour'].cuda()
        text_encoded = data[part]['text_encoded'].cuda()

        for k in range(n_speakers_per_part):
            pan = k
            # pan = np.random.randint(panning[part][0], panning[part][1])
            if any(x in part.lower() for x in ('soprano', 'alto', 'female')):
                speaker_id = torch.LongTensor([next(female_speakers)]).cuda()
            else:
                speaker_id = torch.LongTensor([next(male_speakers)]).cuda()
            print("{} MellotronID {} pan {}".format(part, speaker_id.item(), pan))

            # Note: `mel` (the GST reference mel) is not set in this function; it is
            # expected to be defined at module level, e.g. via load_mel().
            with torch.no_grad():
                mel_outputs, mel_outputs_postnet, gate_outputs, alignments_transfer = \
                    mellotron.inference_noattention(
                        (text_encoded, mel, speaker_id,
                         pitch_contour * frequency_scaling, rhythm))

            plot_mel_f0_alignment(mel_outputs_postnet.data.cpu().numpy()[0],
                                  mel_outputs_postnet.data.cpu().numpy()[0],
                                  pitch_contour.data.cpu().numpy()[0, 0],
                                  rhythm.data.cpu().numpy()[:, 0].T)
            plt.show()

            out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

            # Vocode with MelGAN and save.
            t0 = time.time()
            # wav = aukit.inv_mel_spectrogram()
            out_wav = infer_waveform_melgan(out_mel)
            print(time.time() - t0)
            aukit.save_wav(out_wav,
                           "logs/musicxml_melgan_{}.wav".format(time.strftime("%Y%m%d-%H%M%S")),
                           sr=22050)
            aukit.play_audio(out_wav, sr=22050)

            # Vocode with WaveGlow + denoiser, pan, and mix into the stereo buffer.
            t0 = time.time()
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[0, 0]
            audio = audio.cpu().numpy()
            audio = panner(audio, pan)
            print(time.time() - t0)
            audio_stereo[:audio.shape[0]] += audio
            write("logs/{} {}.wav".format(part, speaker_id.item()),
                  hparams.sampling_rate, audio)
            out_wav = audio
            aukit.play_audio(out_wav, sr=22050)
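# `panner` is used above but not defined in this excerpt. A minimal sketch of a
# constant-power panner consistent with how it is used here (mono signal in,
# (n_samples, 2) stereo out, angle in degrees) follows; the repo's actual
# implementation may differ.
def panner(signal, angle):
    angle = np.radians(angle)
    left = np.sqrt(2) / 2.0 * (np.cos(angle) - np.sin(angle)) * signal
    right = np.sqrt(2) / 2.0 * (np.cos(angle) + np.sin(angle)) * signal
    return np.dstack((left, right))[0]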
def style_transfer():
    # Note: dataloader, datacollate, file_idx, audio_path, text_encoded, pitch_contour
    # and mel are not defined here; they are expected as module-level globals
    # (see style_transfer_v2 above for a self-contained variant).

    # Load source data to obtain rhythm, using Tacotron 2 as a forced aligner.
    x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style transfer (rhythm and pitch contour).
    with torch.no_grad():
        # Get rhythm (alignment map) using Tacotron 2.
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
        rhythm = rhythm.permute(1, 0, 2)

    speaker_id = next(female_speakers) if np.random.randint(2) else next(male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]

    # Vocode with MelGAN.
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    aukit.play_audio(out_wav, sr=22050)

    # Vocode with WaveGlow + denoiser.
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    aukit.play_audio(out_wav, sr=22050)
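# `plot_mel_f0_alignment` is called by the style-transfer and singing-voice demos
# above but is not defined in this excerpt. A minimal sketch in the spirit of the
# Mellotron inference notebook is given below; the figure layout and titles are
# assumptions, not necessarily the repo's exact implementation.
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments,
                          figsize=(16, 16)):
    fig, axes = plt.subplots(4, 1, figsize=figsize)
    axes = axes.flatten()
    axes[0].imshow(mel_source, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title("Source mel")
    axes[1].imshow(mel_outputs_postnet, aspect='auto', origin='lower', interpolation='none')
    axes[1].set_title("Predicted mel")
    axes[2].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2].set_xlim(0, len(f0s))
    axes[2].set_title("Source pitch contour")
    axes[3].imshow(alignments, aspect='auto', origin='lower', interpolation='none')
    axes[3].set_title("Source rhythm (alignment map)")
    plt.tight_layout()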
            fig_path = out_dir.joinpath("demo_{}_{}_fig.jpg".format(cur_time, cur_text))
            plot_mel_alignment_gate_audio(spec, align, gate,
                                          wav[::msyner.stft.sampling_rate // 1000])
            plt.savefig(fig_path)
            plt.close()

            yml_path = out_dir.joinpath("demo_{}_{}_info.yml".format(cur_time, cur_text))
            info_dict = locals2dict(locals())
            with open(yml_path, 'wt', encoding='utf8') as fout:
                yaml.dump(info_dict, fout, default_flow_style=False, allow_unicode=True)

            txt_path = out_dir.joinpath("info_dict.txt")
            with open(txt_path, 'at', encoding='utf8') as fout:
                fout.write('{}\n'.format(json.dumps(info_dict, ensure_ascii=False)))

            num_generated += 1
            # print("\nSaved output as %s\n\n" % out_path)
            if args.play:
                aukit.play_audio(out_path, sr=msyner.stft.sampling_rate)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
melgan_hparams = {}
melgan_hparams.update({k: v for k, v in default_hparams.items()})
melgan_hparams.update(my_hp)
melgan_hparams = Dict2Obj(melgan_hparams)

_pad_len = (default_hparams.n_fft - default_hparams.hop_size) // 2


def melspectrogram(wav, hparams=None):
    wav = np.pad(wav.flatten(), (_pad_len, _pad_len), mode="reflect")
    mel = mel_spectrogram(wav, melgan_hparams)
    mel = mel / 20
    return mel


def inv_melspectrogram(mel, hparams=None):
    mel = mel * 20
    wav = inv_mel_spectrogram(mel, melgan_hparams)
    return wav


if __name__ == "__main__":
    import aukit

    inpath = r"E:\data\temp\01.wav"
    wav = load_wav(inpath, sr=16000)
    mel = melspectrogram(wav)
    out = inv_melspectrogram(mel)
    aukit.play_audio(wav)
    aukit.play_audio(out)
def run_player():
    import aukit
    inpath = Path(r"E:\data\aliaudio\examples\ali_Aibao_000001.wav")
    wav = aukit.load_wav(inpath, sr=16000)
    wav = aukit.change_voice(wav, mode="assign_pitch", alpha=200)
    aukit.play_audio(wav, volume=0.5)
def run_noise_remover():
    import aukit
    inpath = r"hello.wav"
    wav = aukit.load_wav(inpath)
    out = aukit.remove_noise(wav)
    aukit.play_audio(out)
            # plt.imsave(fpath, spec)
            plt.pcolor(spec)
            plt.colorbar()
            plt.savefig(fpath)
            plt.close()

            fpath = args.out_dir.joinpath("demo_out_{}_alignment.jpg".format(cur_time))
            plt.pcolor(align)
            plt.colorbar()
            plt.savefig(fpath)
            plt.close()

            txt_path = args.out_dir.joinpath("info_dict.txt")
            with open(txt_path, 'at', encoding='utf8') as fout:
                dt = dict(text=text, audio_path=str(fpath), speaker=speaker, time=cur_time)
                out = json.dumps(dt, ensure_ascii=False)
                fout.write('{}\n'.format(out))

            num_generated += 1
            print("\nSaved output as %s\n\n" % outpath)
            if args.play:
                aukit.play_audio(fpath, sr=16000)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
            spec = msyner.synthesize(text=text, speaker=speaker)
            # spec, align = synthesize_one(text, speaker=speaker, with_alignment=True,
            #                              hparams=_hparams, encoder_fpath=args.encoder_model_fpath)
            print("Spectrogram shape: {}".format(spec.shape))
            # print("Alignment shape: {}".format(align.shape))

            ## Generating the waveform
            print("Synthesizing the waveform ...")
            wav = griffinlim_vocoder(spec)
            print("Waveform shape: {}".format(wav.shape))

            # Save it on the disk
            cur_time = time.strftime('%Y%m%d_%H%M%S')
            fpath = args.out_dir.joinpath("demo_out_{}.wav".format(cur_time))
            # librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
            aukit.save_wav(wav, fpath, sr=_hparams.sampling_rate)

            # save
            txt_path = args.out_dir.joinpath("info_dict.txt")
            with open(txt_path, 'at', encoding='utf8') as fout:
                dt = dict(text=text, audio_path=str(fpath), speaker=speaker, time=cur_time)
                out = json.dumps(dt, ensure_ascii=False)
                fout.write('{}\n'.format(out))

            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)
            if args.play:
                aukit.play_audio(fpath, sr=_hparams.sampling_rate)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
            traceback.print_exc()
def run_noise_remover():
    import aukit
    inpath = r"E:\data\temp\01.wav"
    wav = aukit.load_wav(inpath)
    out = aukit.remove_noise(wav)
    aukit.play_audio(out)