Example #1
    def test_waveglow(self, tmpdir, df_type):
        # Download the pretrained WaveGlow LJSpeech checkpoint from NGC if it is not cached locally.
        url = "https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ljspeech/versions/2/files/WaveGlowNM.pt"
        ptfile = "./WaveGlowNM.pt"
        if not Path(ptfile).is_file():
            urllib.request.urlretrieve(url, ptfile)

        # Restore the checkpoint into a WaveGlow inference module and switch it to eval mode.
        module = nemo_tts.WaveGlowInferNM(sample_rate=22050)
        module.restore_from(ptfile)
        module.eval()

        # Deterministic dummy mel spectrogram (batch=1, 80 mel bins, 96 frames) used as the export input.
        torch.manual_seed(1)
        mel = torch.randn(1, 80, 96).cuda()

        input_example = OrderedDict([("mel_spectrogram", mel)])
        tmp_file_name = str(tmpdir.mkdir("export").join("waveglow"))

        # Export the module in the requested deployment format and verify the round trip.
        self.__test_export_route(module=module, out_name=tmp_file_name, mode=df_type, input_example=input_example)
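The `__test_export_route` helper is defined elsewhere in this test class and is not shown here. Purely for orientation, the sketch below shows the general shape of such an export round trip for the ONNX case, using only plain `torch.onnx.export` and `onnxruntime` on a toy `torch.nn.Module`; the `ToyVocoder` class, the `check_onnx_roundtrip` helper, and the tolerances are illustrative assumptions, not NeMo's actual export code.

import numpy as np
import torch
import onnxruntime  # assumed to be installed; not used by the original test

class ToyVocoder(torch.nn.Module):
    """Hypothetical stand-in for an exportable module: mel spectrogram -> waveform-like tensor."""

    def __init__(self, n_mels=80, hop=256):
        super().__init__()
        self.proj = torch.nn.Conv1d(n_mels, hop, kernel_size=1)

    def forward(self, mel):
        # (B, n_mels, T) -> (B, hop, T) -> (B, hop * T) pseudo-audio.
        x = self.proj(mel)
        return x.transpose(1, 2).reshape(mel.shape[0], -1)

def check_onnx_roundtrip(module, mel, onnx_path, rtol=1e-3):
    """Hypothetical helper: export to ONNX, rerun with onnxruntime, compare to the eager output."""
    module.eval()
    with torch.no_grad():
        ref = module(mel).numpy()
    torch.onnx.export(module, (mel,), onnx_path,
                      input_names=["mel_spectrogram"], output_names=["audio"])
    sess = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    (out,) = sess.run(None, {"mel_spectrogram": mel.numpy()})
    assert out.shape == ref.shape
    assert np.allclose(ref, out, rtol=rtol, atol=1e-4)

# Same dummy input shape as the test above, kept on CPU for simplicity.
torch.manual_seed(1)
check_onnx_roundtrip(ToyVocoder(), torch.randn(1, 80, 96), "toy_vocoder.onnx")

A real WaveGlow export is more involved, since WaveGlow inference draws random noise internally, so an exact output comparison like the one above would not apply directly.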
Example #2
def main():
    args = parse_args()
    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
    )

    use_cache = True
    if args.local_rank is not None:
        print("Doing ALL GPU")
        use_cache = False

    # Create text to spectrogram model
    if args.spec_model == "tacotron2":
        yaml = YAML(typ="safe")
        with open(args.spec_model_config) as file:
            tacotron2_params = yaml.load(file)
        spec_neural_modules = create_NMs(tacotron2_params, decoder_infer=True)
        infer_tensors = create_infer_dags(
            neural_factory=neural_factory,
            neural_modules=spec_neural_modules,
            tacotron2_params=tacotron2_params,
            infer_dataset=args.eval_dataset,
            infer_batch_size=args.batch_size,
        )

    print("Running Tacotron 2")
    # Run tacotron 2
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=False,
    )
    mel_len = evaluated_tensors[-1]
    print("Done Running Tacotron 2")
    filterbank = librosa.filters.mel(
        sr=tacotron2_params["sample_rate"],
        n_fft=tacotron2_params["n_fft"],
        n_mels=tacotron2_params["n_mels"],
        fmax=tacotron2_params["fmax"],
    )

    if args.vocoder == "griffin-lim":
        print("Running Griffin-Lim")
        mel_spec = evaluated_tensors[0]
        for i, batch in enumerate(mel_spec):
            # Undo the log, then project the mel bins back to linear-frequency magnitudes
            # through the mel filterbank before phase reconstruction with Griffin-Lim.
            log_mel = batch.cpu().numpy().transpose(0, 2, 1)
            mel = np.exp(log_mel)
            magnitudes = np.dot(mel, filterbank) * args.griffin_lim_mag_scale
            for j, sample in enumerate(magnitudes):
                sample = sample[:mel_len[i][j], :]
                audio = griffin_lim(sample.T**args.griffin_lim_power)
                save_file = f"sample_{i * 32 + j}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                write(save_file, tacotron2_params["sample_rate"], audio)
                plot_and_save_spec(log_mel[j][:mel_len[i][j], :].T, i * 32 + j,
                                   args.save_dir)

    elif args.vocoder == "waveglow":
        (mel_pred, _, _, _) = infer_tensors
        if not args.vocoder_model_config or not args.vocoder_model_load_dir:
            raise ValueError(
                "Using waveglow as the vocoder requires the "
                "--vocoder_model_config and --vocoder_model_load_dir args")

        yaml = YAML(typ="safe")
        with open(args.vocoder_model_config) as file:
            waveglow_params = yaml.load(file)
        # Attach WaveGlow to the predicted mel spectrogram; this only builds the inference
        # DAG. The checkpoint itself is restored by neural_factory.infer() below via
        # checkpoint_dir / modules_to_restore.
        waveglow = nemo_tts.WaveGlowInferNM(sigma=args.waveglow_sigma,
                                            **waveglow_params["WaveGlowNM"])
        audio_pred = waveglow(mel_spectrogram=mel_pred)
        # waveglow.restore_from(args.vocoder_model_load_dir)

        # Run waveglow
        print("Running Waveglow")
        evaluated_tensors = neural_factory.infer(
            tensors=[audio_pred],
            checkpoint_dir=args.vocoder_model_load_dir,
            # checkpoint_dir=None,
            modules_to_restore=[waveglow],
            use_cache=use_cache,
        )
        print("Done Running Waveglow")

        if args.waveglow_denoiser_strength > 0:
            print("Setup denoiser")
            waveglow.setup_denoiser()

        print("Saving results to disk")
        for i, batch in enumerate(evaluated_tensors[0]):
            audio = batch.cpu().numpy()
            for j, sample in enumerate(audio):
                sample_len = mel_len[i][j] * tacotron2_params["n_stride"]
                sample = sample[:sample_len]
                save_file = f"sample_{i * 32 + j}.wav"
                if args.save_dir:
                    save_file = os.path.join(args.save_dir, save_file)
                if args.waveglow_denoiser_strength > 0:
                    sample, spec = waveglow.denoise(
                        sample, strength=args.waveglow_denoiser_strength)
                else:
                    spec, _ = librosa.core.magphase(
                        librosa.core.stft(sample,
                                          n_fft=waveglow_params["n_fft"]))
                write(save_file, waveglow_params["sample_rate"], sample)
                spec = np.dot(filterbank, spec)
                spec = np.log(np.clip(spec, a_min=1e-5, a_max=None))
                plot_and_save_spec(spec, i * 32 + j, args.save_dir)
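The `griffin_lim` and `plot_and_save_spec` helpers called above live elsewhere in the script and are not part of this excerpt. For orientation, here is a minimal, non-authoritative sketch of a Griffin-Lim phase-reconstruction helper built only on `librosa.stft`/`librosa.istft`; the iteration count and the default `n_fft` are assumptions, and the real helper may differ.

import numpy as np
import librosa

def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
    # Hypothetical sketch, not the script's actual helper.
    # `magnitudes` is a linear-frequency magnitude spectrogram of shape (1 + n_fft // 2, T).
    # Start from random phase, then alternate ISTFT/STFT, keeping the target magnitudes
    # and re-estimating only the phase on each iteration.
    phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    audio = librosa.istft(magnitudes * phase)
    for _ in range(n_iters):
        _, phase = librosa.magphase(librosa.stft(audio, n_fft=n_fft))
        audio = librosa.istft(magnitudes * phase)
    return audio.astype(np.float32)

Note that the script raises the magnitudes to `args.griffin_lim_power` before calling the helper (see the `griffin_lim(sample.T**args.griffin_lim_power)` call above).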