Ejemplo n.º 1
0
    def test_fastspeech(self):
        """Smoke-test FastSpeech training on the sample manifest.

        The test first runs ``NeuralModuleFactory.infer()`` over an
        ``AudioToTextDataLayer`` + mel-spectrogram preprocessor to obtain the
        transcript and spectrogram lengths, writes one synthetic duration file
        per batch into ``tests/data/asr/durs``, and then trains a FastSpeech
        model with SGD for 3 epochs using those durations.
        """
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )

        # batch_size=1 and shuffle=False so that the k-th inferred batch maps
        # to exactly one sample and the `{k}.npy` file names are deterministic.
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=1,
            shuffle=False,
            sample_rate=16000,
        )

        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            pad_to=0,
        )

        data = data_layer()
        spec, spec_length = data_preprocessor(input_signal=data.audio_signal,
                                              length=data.a_sig_length)

        # Creates and saves durations as numpy arrays.
        durs_dir = pathlib.Path('tests/data/asr/durs')
        # parents=True: do not fail when an intermediate directory is missing.
        durs_dir.mkdir(parents=True, exist_ok=True)
        result = neural_factory.infer(
            [data.transcripts, data.transcript_length, spec_length, spec])
        for k, (text, text_len, mel_len, _mel) in enumerate(
                zip(result[0], result[1], result[2], result[3])):
            # Strip the padding from the single transcript in this batch.
            text = text.cpu().numpy()[0][:text_len.cpu().numpy()[0]]
            # `np.long` was removed in NumPy 1.24; use an explicit 64-bit int.
            dur = np.zeros(text.shape[0], dtype=np.int64)
            dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
            # Synthetic alignment: the second symbol gets 4 units, the first
            # gets the remainder, all others stay 0, so sum(dur) == dur_sum.
            dur[0] = dur_sum - 4
            dur[1] = 4
            np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

        data_layer = nemo_tts.FastSpeechDataLayer(
            manifest_filepath=self.manifest_filepath,
            durs_dir=durs_dir,
            labels=self.labels,
            batch_size=4,
            sample_rate=16000,
        )

        fastspeech = nemo_tts.FastSpeech(
            decoder_output_size=384,
            n_mels=64,
            max_seq_len=2048,
            word_vec_dim=384,
            encoder_n_layer=6,
            encoder_head=2,
            encoder_conv1d_filter_size=1536,
            decoder_n_layer=6,
            decoder_head=2,
            decoder_conv1d_filter_size=1536,
            fft_conv1d_kernel=3,
            fft_conv1d_padding=1,
            encoder_output_size=384,
            duration_predictor_filter_size=256,
            duration_predictor_kernel_size=3,
            dropout=0.1,
            alpha=1.0,
            n_src_vocab=len(self.labels),
            pad_id=0,
        )

        loss = nemo_tts.FastSpeechLoss()

        # Wire the training graph: the same preprocessor produces the target
        # mel spectrogram from the FastSpeech data layer's audio.
        data = data_layer()
        mel_true, _ = data_preprocessor(input_signal=data.audio,
                                        length=data.audio_len)
        mel_pred, dur_pred = fastspeech(
            text=data.text,
            text_pos=data.text_pos,
            mel_true=mel_true,
            dur_true=data.dur_true,
        )
        loss_t = loss(
            mel_true=mel_true,
            mel_pred=mel_pred,
            dur_true=data.dur_true,
            dur_pred=dur_pred,
            text_pos=data.text_pos,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss_t],
            print_func=lambda x: logging.info(f'Train Loss: {x[0].item()}'),
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss_t],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 3,
                "lr": 0.0003
            },
        )
Ejemplo n.º 2
0
    def test_fastspeech(self):
        """Integration test that instantiates a FastSpeech model and tests training with the sample asr data.
        Note instantiating the FastSpeech model additionally requires creating speech durations which additionally
        tests NeuralModuleFactory.infer().
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        # batch_size=1 and shuffle=False so that the k-th inferred batch maps to exactly one sample and the
        # `{k}.npy` file names are deterministic.
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=1,
            shuffle=False,
            sample_rate=16000,
        )

        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            pad_to=0,
            log_zero_guard_type="clamp",
            log_zero_guard_value=1e-05,
        )

        data = data_layer()
        spec, spec_length = data_preprocessor(input_signal=data.audio_signal, length=data.a_sig_length)

        # Creates and saves durations as numpy arrays.
        durs_dir = pathlib.Path('tests/data/asr/durs')
        # parents=True: do not fail when an intermediate directory is missing.
        durs_dir.mkdir(parents=True, exist_ok=True)
        result = self.nf.infer([data.transcripts, data.transcript_length, spec_length, spec])
        for k, (text, text_len, mel_len, _mel) in enumerate(zip(result[0], result[1], result[2], result[3])):
            # Strip the padding from the single transcript in this batch.
            text = text.cpu().numpy()[0][: text_len.cpu().numpy()[0]]
            # `np.long` was removed in NumPy 1.24; use an explicit 64-bit integer dtype.
            dur = np.zeros(text.shape[0], dtype=np.int64)
            dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
            # Synthetic alignment: the second symbol gets 4 units, the first gets the remainder, all others
            # stay 0, so sum(dur) == dur_sum.
            dur[0] = dur_sum - 4
            dur[1] = 4
            np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

        data_layer = nemo_tts.FastSpeechDataLayer(
            manifest_filepath=self.manifest_filepath,
            durs_dir=durs_dir,
            labels=self.labels,
            batch_size=4,
            sample_rate=16000,
        )

        fastspeech = nemo_tts.FastSpeech(
            decoder_output_size=384,
            n_mels=64,
            max_seq_len=2048,
            word_vec_dim=384,
            encoder_n_layer=6,
            encoder_head=2,
            encoder_conv1d_filter_size=1536,
            decoder_n_layer=6,
            decoder_head=2,
            decoder_conv1d_filter_size=1536,
            fft_conv1d_kernel=3,
            fft_conv1d_padding=1,
            encoder_output_size=384,
            duration_predictor_filter_size=256,
            duration_predictor_kernel_size=3,
            dropout=0.1,
            alpha=1.0,
            n_src_vocab=len(self.labels),
            pad_id=0,
        )

        loss = nemo_tts.FastSpeechLoss()

        # Wire the training graph: the same preprocessor produces the target mel spectrogram from the
        # FastSpeech data layer's audio.
        data = data_layer()
        mel_true, _ = data_preprocessor(input_signal=data.audio, length=data.audio_len)
        mel_pred, dur_pred = fastspeech(
            text=data.text, text_pos=data.text_pos, mel_true=mel_true, dur_true=data.dur_true,
        )
        loss_t = loss(
            mel_true=mel_true, mel_pred=mel_pred, dur_true=data.dur_true, dur_pred=dur_pred, text_pos=data.text_pos,
        )

        # The callback is handed `loss_list` through `loss_log_list` so the losses can be inspected after training.
        loss_list = []
        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.0003}
        )

        # Fail with a clear message (instead of an IndexError) if the callback never recorded a loss.
        assert loss_list, "no training losses were logged"
        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]