Ejemplo n.º 1
0
def create_eval_dags(
    neural_factory, neural_modules, waveglow_params, eval_datasets, eval_batch_size, eval_freq, cpu_per_dl=1,
):
    data_preprocessor, waveglow, _ = neural_modules

    eval_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"])
    eval_dl_params.update(waveglow_params["AudioDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    callbacks = []
    # assemble eval DAGs
    for eval_dataset in eval_datasets:
        data_layer_eval = nemo_tts.AudioDataLayer(
            manifest_filepath=eval_dataset, batch_size=eval_batch_size, num_workers=cpu_per_dl, **eval_dl_params,
        )

        audio, audio_len, = data_layer_eval()
        spec_target, spec_target_len = data_preprocessor(input_signal=audio, length=audio_len)

        audio_pred, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)

        # create corresponding eval callback
        tagname = os.path.basename(eval_dataset).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[audio_pred, spec_target, spec_target_len],
            user_iter_callback=waveglow_process_eval_batch,
            user_epochs_done_callback=lambda x: x,
            tb_writer_func=partial(waveglow_eval_log_to_tb_func, tag=tagname, mel_fb=data_preprocessor.filter_banks,),
            eval_step=eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return callbacks
Ejemplo n.º 2
0
    def test_waveglow_training(self):
        data_layer = nemo_tts.AudioDataLayer(
            manifest_filepath=self.manifest_filepath,
            n_segments=4000,
            batch_size=4,
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
        )
        waveglow = nemo_tts.WaveGlowNM(
            n_mel_channels=64,
            n_flows=6,
            n_group=4,
            n_early_every=4,
            n_early_size=2,
            n_wn_layers=4,
            n_wn_channels=256,
            wn_kernel_size=3,
        )
        waveglow_loss = nemo_tts.WaveGlowLoss()

        # DAG
        audio, audio_len, = data_layer()
        spec_target, _ = preprocessing(input_signal=audio, length=audio_len)

        z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target,
                                                 audio=audio)
        loss_t = waveglow_loss(z=z,
                               log_s_list=log_s_list,
                               log_det_W_list=log_det_W_list)

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss_t],
            print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'
                                              ),
        )
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss_t],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Ejemplo n.º 3
0
    def test_waveglow_training(self):
        """Integtaion test that instantiates a smaller WaveGlow model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        data_layer = nemo_tts.AudioDataLayer(
            manifest_filepath=self.manifest_filepath, n_segments=4000, batch_size=4, sample_rate=16000
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
        )
        waveglow = nemo_tts.WaveGlowNM(
            n_mel_channels=64,
            n_flows=6,
            n_group=4,
            n_early_every=4,
            n_early_size=2,
            n_wn_layers=4,
            n_wn_channels=256,
            wn_kernel_size=3,
            sample_rate=16000,
        )
        waveglow_loss = nemo_tts.WaveGlowLoss(sample_rate=16000)

        # DAG
        audio, audio_len, = data_layer()
        spec_target, _ = preprocessing(input_signal=audio, length=audio_len)

        z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
        loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

        loss_list = []
        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
        )

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Ejemplo n.º 4
0
def create_train_dag(
    neural_factory,
    neural_modules,
    waveglow_params,
    train_dataset,
    batch_size,
    checkpoint_save_freq,
    cpu_per_dl=1,
):
    data_preprocessor, waveglow, waveglow_loss = neural_modules

    train_dl_params = copy.deepcopy(waveglow_params["AudioDataLayer"])
    train_dl_params.update(waveglow_params["AudioDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_tts.AudioDataLayer(
        manifest_filepath=train_dataset,
        batch_size=batch_size,
        num_workers=cpu_per_dl,
        **train_dl_params,
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (batch_size * neural_factory.world_size))
    logging.info('Have {0} examples to train on.'.format(N))

    # Train DAG
    audio, audio_len, = data_layer()
    spec_target, spec_target_len = data_preprocessor(input_signal=audio,
                                                     length=audio_len)

    z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target,
                                             audio=audio)
    loss_t = waveglow_loss(z=z,
                           log_s_list=log_s_list,
                           log_det_W_list=log_det_W_list)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, z, spec_target, spec_target_len],
        print_func=lambda x: logging.info(f"Loss: {x[0].data}"),
        log_to_tb_func=partial(waveglow_log_to_tb_func, log_images=False),
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, step_freq=checkpoint_save_freq)

    callbacks = [train_callback, chpt_callback]
    return loss_t, callbacks, steps_per_epoch