def test_waveglow(self, tmpdir, df_type):
    """Export smoke test for WaveGlow.

    Restores a pretrained WaveGlow checkpoint (downloaded from NGC on
    first run), feeds it a fixed random mel input, and pushes it through
    the export route for the given deployment format ``df_type``.
    """
    checkpoint_url = "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ljspeech/versions/2/files/WaveGlowNM.pt"
    checkpoint_path = "./WaveGlowNM.pt"
    # Download once; later runs reuse the local copy.
    if not Path(checkpoint_path).is_file():
        urllib.request.urlretrieve(checkpoint_url, checkpoint_path)

    module = nemo_tts.WaveGlowInferNM(sample_rate=22050)
    module.restore_from(checkpoint_path)
    module.eval()

    # Fixed seed keeps the dummy mel input reproducible across runs.
    torch.manual_seed(1)
    mel = torch.randn(1, 80, 96).cuda()  # presumably (batch, n_mels, frames) — TODO confirm
    input_example = OrderedDict([("mel_spectrogram", mel)])

    export_path = str(tmpdir.mkdir("export").join("waveglow"))
    self.__test_export_route(
        module=module,
        out_name=export_path,
        mode=df_type,
        input_example=input_example,
    )
def main():
    """Synthesize speech from the eval dataset.

    Runs Tacotron 2 inference to produce mel spectrograms, then vocodes
    them to audio with either Griffin-Lim or a pretrained WaveGlow model.
    Writes ``sample_<idx>.wav`` files and spectrogram plots to
    ``args.save_dir``.

    Fix: the output sample index previously hard-coded the batch size as
    ``i * 32 + j``; it now uses ``args.batch_size`` (the same value the
    infer DAG is built with), so filenames are correct for any batch size.
    """
    args = parse_args()
    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
    )

    use_cache = True
    if args.local_rank is not None:
        print("Doing ALL GPU")
        # Multi-GPU runs cannot reuse the single-process tensor cache.
        use_cache = False

    # Create text to spectrogram model
    if args.spec_model == "tacotron2":
        yaml = YAML(typ="safe")
        with open(args.spec_model_config) as file:
            tacotron2_params = yaml.load(file)
        spec_neural_modules = create_NMs(tacotron2_params, decoder_infer=True)
        infer_tensors = create_infer_dags(
            neural_factory=neural_factory,
            neural_modules=spec_neural_modules,
            tacotron2_params=tacotron2_params,
            infer_dataset=args.eval_dataset,
            infer_batch_size=args.batch_size,
        )

    print("Running Tacotron 2")
    # Run tacotron 2
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=False,
    )
    # Per-sample mel lengths; the DAG emits them as its last output tensor.
    mel_len = evaluated_tensors[-1]
    print("Done Running Tacotron 2")

    # Mel filterbank used to map mel spectrograms back to linear magnitudes
    # (griffin-lim) and linear magnitudes to mel for plotting (waveglow).
    filterbank = librosa.filters.mel(
        sr=tacotron2_params["sample_rate"],
        n_fft=tacotron2_params["n_fft"],
        n_mels=tacotron2_params["n_mels"],
        fmax=tacotron2_params["fmax"],
    )

    if args.vocoder == "griffin-lim":
        print("Running Griffin-Lim")
        mel_spec = evaluated_tensors[0]
        for i, batch in enumerate(mel_spec):
            # (batch, n_mels, T) -> (batch, T, n_mels); model outputs log-mel.
            log_mel = batch.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            magnitudes = np.dot(mel, filterbank) * args.griffin_lim_mag_scale
            for j, sample in enumerate(magnitudes):
                # Trim padding frames beyond the true mel length.
                sample = sample[:mel_len[i][j], :]
                audio = griffin_lim(sample.T ** args.griffin_lim_power)
                # Index samples by their position in the whole dataset.
                sample_idx = i * args.batch_size + j
                save_file = f"sample_{sample_idx}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                write(save_file, tacotron2_params["sample_rate"], audio)
                plot_and_save_spec(
                    log_mel[j][:mel_len[i][j], :].T, sample_idx, args.save_dir
                )
    elif args.vocoder == "waveglow":
        (mel_pred, _, _, _) = infer_tensors
        if not args.vocoder_model_config or not args.vocoder_model_load_dir:
            raise ValueError(
                "Using waveglow as the vocoder requires the "
                "--vocoder_model_config and --vocoder_model_load_dir args"
            )

        yaml = YAML(typ="safe")
        with open(args.vocoder_model_config) as file:
            waveglow_params = yaml.load(file)
        waveglow = nemo_tts.WaveGlowInferNM(
            sigma=args.waveglow_sigma, **waveglow_params["WaveGlowNM"]
        )
        audio_pred = waveglow(mel_spectrogram=mel_pred)

        # Run waveglow
        print("Running Waveglow")
        evaluated_tensors = neural_factory.infer(
            tensors=[audio_pred],
            checkpoint_dir=args.vocoder_model_load_dir,
            modules_to_restore=[waveglow],
            # Reuse the tensors cached by the Tacotron 2 pass above.
            use_cache=use_cache,
        )
        print("Done Running Waveglow")

        if args.waveglow_denoiser_strength > 0:
            print("Setup denoiser")
            waveglow.setup_denoiser()

        print("Saving results to disk")
        for i, batch in enumerate(evaluated_tensors[0]):
            audio = batch.cpu().numpy()
            for j, sample in enumerate(audio):
                # Audio length = mel frames * hop size (n_stride).
                sample_len = mel_len[i][j] * tacotron2_params["n_stride"]
                sample = sample[:sample_len]
                sample_idx = i * args.batch_size + j
                save_file = f"sample_{sample_idx}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                if args.waveglow_denoiser_strength > 0:
                    sample, spec = waveglow.denoise(
                        sample, strength=args.waveglow_denoiser_strength
                    )
                else:
                    # NOTE(review): spectrogram uses the waveglow n_fft while
                    # the filterbank was built with the tacotron2 n_fft —
                    # confirm the two configs agree.
                    spec, _ = librosa.core.magphase(
                        librosa.core.stft(sample, n_fft=waveglow_params["n_fft"])
                    )
                write(save_file, waveglow_params["sample_rate"], sample)
                spec = np.dot(filterbank, spec)
                # Log-compress for plotting; clip to avoid log(0).
                spec = np.log(np.clip(spec, a_min=1e-5, a_max=None))
                plot_and_save_spec(spec, sample_idx, args.save_dir)