Example #1
    def test_waveglow_training(self):
        """Integtaion test that instantiates a smaller WaveGlow model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        data_layer = nemo_tts.AudioDataLayer(
            manifest_filepath=self.manifest_filepath, n_segments=4000, batch_size=4, sample_rate=16000
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
        )
        waveglow = nemo_tts.WaveGlowNM(
            n_mel_channels=64,
            n_flows=6,
            n_group=4,
            n_early_every=4,
            n_early_size=2,
            n_wn_layers=4,
            n_wn_channels=256,
            wn_kernel_size=3,
            sample_rate=16000,
        )
        waveglow_loss = nemo_tts.WaveGlowLoss(sample_rate=16000)

        # DAG
        audio, audio_len = data_layer()
        spec_target, _ = preprocessing(input_signal=audio, length=audio_len)

        z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
        loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

        loss_list = []
        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
        )

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
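This test (and several later ones) passes partial(self.print_and_log_loss, loss_log_list=loss_list) as the callback's print_func, but the helper itself lies outside the excerpt. A minimal sketch of what such a helper could look like, with the name and signature inferred from the call site and the body assumed:

import logging
from functools import partial

def print_and_log_loss(tensors, loss_log_list):
    # Hypothetical print_func: read the scalar loss, log it, and record it so the
    # test can compare the first and last values afterwards.
    loss_value = tensors[0].item()
    logging.info("Loss: %f", loss_value)
    loss_log_list.append(loss_value)

# Used as in the tests above:
# print_func=partial(print_and_log_loss, loss_log_list=loss_list)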
Example #2
    with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
        _, img, _, _, fine_target, _ = cifar100_dl()
        feat_map = image_encoder(inputs=img)
        res_img = reshaper(inputs=feat_map)
        logits = ffn(inputs=res_img)
        pred = nl(inputs=logits)
        loss = nll_loss(predictions=pred, targets=fine_target)
        # Set output - that output will be used for training.
        training_graph.outputs["loss"] = loss

    # Freeze the pretrained encoder.
    training_graph.freeze(["vgg16"])
    logging.info(training_graph.summary())

    # SimpleLossLoggerCallback will print loss values to console.
    callback = SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(f'Training Loss: {str(x[0].item())}'),
    )

    # Invoke the "train" action.
    nf.train(
        training_graph=training_graph,
        callbacks=[callback],
        optimization_params={
            "num_epochs": 10,
            "lr": 0.001
        },
        optimizer="adam",
    )
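training_graph.freeze(["vgg16"]) keeps the pretrained VGG16 encoder fixed while the rest of the graph trains. As an illustration of the underlying idea (plain PyTorch, not the NeMo implementation), freezing simply means excluding those parameters from gradient updates:

import torch
import torch.nn as nn

# Illustrative stand-ins for the encoder and trainable head; not the modules used above.
encoder = nn.Sequential(nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1))
head = nn.Linear(64, 100)  # CIFAR-100 has 100 fine-grained classes

for p in encoder.parameters():
    p.requires_grad = False  # frozen: no gradients, no optimizer updates

# Only the head's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(head.parameters(), lr=0.001)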
Example #3
            data.noncategorical_slot_status,
        ]

    steps_per_epoch = math.ceil(
        len(datalayer) / (args.train_batch_size * args.num_gpus))
    return steps_per_epoch, tensors


steps_per_epoch, train_tensors = create_pipeline(dataset_split='train')
logging.info(f'Steps per epoch: {steps_per_epoch}')

# Create trainer and execute training action
train_callback = SimpleLossLoggerCallback(
    tensors=train_tensors,
    print_func=lambda x: logging.info("Loss: {:.8f}".format(x[0].item())),
    get_tb_values=lambda x: [["loss", x[0]]],
    tb_writer=nf.tb_writer,
    step_freq=args.loss_log_freq
    if args.loss_log_freq > 0 else steps_per_epoch,
)


def get_eval_callback(eval_dataset):
    _, eval_tensors = create_pipeline(dataset_split=eval_dataset)
    eval_callback = EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=lambda x, y: eval_iter_callback(
            x, y, schema_preprocessor, eval_dataset),
        user_epochs_done_callback=lambda x: eval_epochs_done_callback(
            x,
            args.task_name,
            eval_dataset,
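The steps_per_epoch computation above divides the dataset size by the effective global batch size (per-GPU batch size times the number of GPUs). A quick worked example with made-up numbers:

import math

# Hypothetical values, for illustration only.
dataset_len = 16444      # len(datalayer)
train_batch_size = 32    # per-GPU batch size
num_gpus = 4

steps_per_epoch = math.ceil(dataset_len / (train_batch_size * num_gpus))
print(steps_per_epoch)  # ceil(16444 / 128) = 129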
Example #4
    is_training=True,
    num_gpus=args.num_gpus,
)
eval_tensors, _, _, eval_data_layer = create_pipeline(
    num_samples=args.num_eval_samples,
    batch_size=args.batch_size,
    data_prefix=args.eval_file_prefix,
    is_training=False,
    num_gpus=args.num_gpus,
)

# Create callbacks for train and eval modes
train_callback = SimpleLossLoggerCallback(
    tensors=train_tensors,
    print_func=lambda x: logging.info(str(round(x[0].item(), 3))),
    tb_writer=nf.tb_writer,
    get_tb_values=lambda x: [["loss", x[0]]],
    step_freq=train_steps_per_epoch,
)

eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=eval_tensors,
    user_iter_callback=lambda x, y: eval_iter_callback(x, y),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x,
        intents_label_ids=data_desc.intents_label_ids,
        slots_label_ids=data_desc.slots_label_ids,
        graph_fold=f'{nf.work_dir}/graphs',
        normalize_cm=True,
    ),
    tb_writer=nf.tb_writer,
Example #5
    def test_stft_conv_training(self):
        """Integration test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training runs for 3 forward and backward steps; the test asserts that the loss after step 3 is smaller than
        the loss at the first step.
        Note: Training is done with batch gradient descent, as opposed to stochastic gradient descent, due to the CTC loss.
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
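Per the docstring, this test exercises the convolution-based torch_stft path via 'stft_conv': True, while test_jasper_training covers the built-in torch.stft path. The counterpart configuration presumably differs only in that flag (the actual test_jasper_training setup is not shown here):

        # Assumed counterpart config for the torch.stft path; only the flag differs.
        pre_process_params_torch_stft = dict(pre_process_params, stft_conv=False)
        preprocessing_torch_stft = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params_torch_stft)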
Example #6
    def test_contextnet_ctc_training(self):
        """Integration test that instantiates a small ContextNet model and tests training with the sample asr data.
        Training runs for 3 forward and backward steps; the test asserts that the loss after step 3 is smaller than
        the loss at the first step.
        Note: Training is done with batch gradient descent, as opposed to stochastic gradient descent, due to the CTC loss.
        Checks the SE block with both a fixed context size and global context, and the residual_mode='stride_add' and
        'stride_last' flags.
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/contextnet_32.yaml"))) as f:
            contextnet_model_definition = self.yaml.load(f)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 80,
            'window_size': 0.025,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)

        spec_aug = nemo_asr.SpectrogramAugmentation(
            **contextnet_model_definition['SpectrogramAugmentation'])

        contextnet_encoder = nemo_asr.ContextNetEncoder(
            feat_in=contextnet_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **contextnet_model_definition['ContextNetEncoder'],
        )
        contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(feat_in=32,
                                                              hidden_size=16,
                                                              num_classes=len(
                                                                  self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        processed_signal = spec_aug(input_spec=processed_signal)

        encoded, encoded_len = contextnet_encoder(
            audio_signal=processed_signal, length=p_length)
        log_probs = contextnet_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
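Both ASR tests load their model definitions through self.yaml, which is set up outside these excerpts. A typical setup, assumed here, uses ruamel.yaml's safe loader:

from ruamel.yaml import YAML

# Assumed initialization of the tests' `self.yaml` attribute.
yaml_loader = YAML(typ="safe")
# e.g. in the test fixture: self.yaml = yaml_loader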
Example #7
                                        mode="train")

        # evaluation pipelines
        eval_tensors = create_pipeline(eval_examples,
                                       args.eval_batch_size,
                                       mode="eval")

        def print_loss(x):
            loss = x[0].item()
            logging.info("Training loss: {:.4f}".format(loss))

        # callbacks
        callback_train = SimpleLossLoggerCallback(
            tensors=[train_tensors[0]],
            step_freq=100,
            print_func=print_loss,
            get_tb_values=lambda x: [["loss", x[0]]],
            tb_writer=nf.tb_writer,
        )

        callbacks = [callback_train]

        # for eval_examples in args.eval_file_preprocessed:
        callback_eval = EvaluatorCallback(
            eval_tensors=eval_tensors,
            user_iter_callback=lambda x, y: eval_iter_callback(
                x, y, tokenizer),
            user_epochs_done_callback=eval_epochs_done_callback,
            eval_step=args.eval_freq,
            tb_writer=nf.tb_writer,
        )
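The excerpt stops before the callbacks reach the trainer. Based on the train calls in the other examples here, the remaining wiring would look roughly like the sketch below; the optimizer choice and args.num_epochs / args.lr are placeholders rather than values from the original script:

        callbacks.append(callback_eval)

        # Rough, assumed continuation following the pattern of the other examples.
        nf.train(
            [train_tensors[0]],  # presumably the training loss tensor
            callbacks=callbacks,
            optimizer="adam",
            optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
        )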
Example #8
    def test_tacotron2_training(self):
        """Integtaion test that instantiates a smaller Tacotron2 model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            log_zero_guard_type="clamp",
            log_zero_guard_value=1e-05,
        )
        text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
        t2_enc = nemo_tts.Tacotron2Encoder(encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256)
        t2_dec = nemo_tts.Tacotron2Decoder(
            n_mel_channels=64,
            n_frames_per_step=1,
            encoder_embedding_dim=256,
            gate_threshold=0.5,
            prenet_dim=128,
            max_decoder_steps=1000,
            decoder_rnn_dim=512,
            p_decoder_dropout=0.1,
            p_attention_dropout=0.1,
            attention_rnn_dim=512,
            attention_dim=64,
            attention_location_n_filters=16,
            attention_location_kernel_size=15,
        )
        t2_postnet = nemo_tts.Tacotron2Postnet(
            n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3
        )
        t2_loss = nemo_tts.Tacotron2Loss()
        makegatetarget = nemo_tts.MakeGate()

        # DAG
        audio, audio_len, transcript, transcript_len = data_layer()
        spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)

        transcript_embedded = text_embedding(char_phone=transcript)
        transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
        mel_decoder, gate, _ = t2_dec(
            char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target
        )
        mel_postnet = t2_postnet(mel_input=mel_decoder)
        gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
        loss_t = t2_loss(
            mel_out=mel_decoder,
            mel_out_postnet=mel_postnet,
            gate_out=gate,
            mel_target=spec_target,
            gate_target=gate_target,
            target_len=spec_target_len,
            seq_len=audio_len,
        )
        loss_list = []

        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
        )

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
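An aside on the preprocessor settings used in these TTS tests: pad_value=-11.52 appears to be chosen to match the log-zero guard, since log(1e-05) is roughly -11.51, so padded frames look like clamped "silence" rather than an arbitrary constant. A one-line check:

import math

print(math.log(1e-05))  # -11.512925464970229, i.e. approximately the pad_value above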
Example #9
    def test_fastspeech(self):
        """Integtaion test that instantiates a FastSpeech model and tests training with the sample asr data.
        Note instantiating the FastSpeech model additionally requires creating speech durations which additionally
        tests NeuralModuleFactory.infer().
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=1,
            shuffle=False,
            sample_rate=16000,
        )

        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            pad_to=0,
            log_zero_guard_type="clamp",
            log_zero_guard_value=1e-05,
        )

        data = data_layer()
        spec, spec_length = data_preprocessor(input_signal=data.audio_signal, length=data.a_sig_length)

        # Creates and saves durations as numpy arrays.
        durs_dir = pathlib.Path('tests/data/asr/durs')
        durs_dir.mkdir(exist_ok=True)
        result = self.nf.infer([data.transcripts, data.transcript_length, spec_length, spec])
        k = -1
        for text, text_len, mel_len, mel in zip(result[0], result[1], result[2], result[3]):
            text = text.cpu().numpy()[0][: text_len.cpu().numpy()[0]]
            dur = np.zeros(text.shape[0], dtype=np.int64)
            dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
            dur[0] = dur_sum - 4
            dur[1] = 4
            k += 1
            np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

        data_layer = nemo_tts.FastSpeechDataLayer(
            manifest_filepath=self.manifest_filepath,
            durs_dir=durs_dir,
            labels=self.labels,
            batch_size=4,
            sample_rate=16000,
        )

        fastspeech = nemo_tts.FastSpeech(
            decoder_output_size=384,
            n_mels=64,
            max_seq_len=2048,
            word_vec_dim=384,
            encoder_n_layer=6,
            encoder_head=2,
            encoder_conv1d_filter_size=1536,
            decoder_n_layer=6,
            decoder_head=2,
            decoder_conv1d_filter_size=1536,
            fft_conv1d_kernel=3,
            fft_conv1d_padding=1,
            encoder_output_size=384,
            duration_predictor_filter_size=256,
            duration_predictor_kernel_size=3,
            dropout=0.1,
            alpha=1.0,
            n_src_vocab=len(self.labels),
            pad_id=0,
        )

        loss = nemo_tts.FastSpeechLoss()

        data = data_layer()
        mel_true, _ = data_preprocessor(input_signal=data.audio, length=data.audio_len)
        mel_pred, dur_pred = fastspeech(
            text=data.text, text_pos=data.text_pos, mel_true=mel_true, dur_true=data.dur_true,
        )
        loss_t = loss(
            mel_true=mel_true, mel_pred=mel_pred, dur_true=data.dur_true, dur_pred=dur_pred, text_pos=data.text_pos,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.0003}
        )

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
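The durations saved earlier in this test are placeholders rather than real alignments: only the first two tokens receive nonzero values, chosen so the per-utterance sum equals the mel length plus the offset flagged by the TODO, which is presumably what FastSpeechDataLayer checks. A small illustration with made-up lengths:

import numpy as np

# Made-up lengths, for illustration only.
mel_len, text_len = 37, 10
dur = np.zeros(text_len, dtype=np.int64)
dur_sum = mel_len + 1
dur[0] = dur_sum - 4
dur[1] = 4
assert dur.sum() == mel_len + 1  # total duration matches the (offset) mel length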
Example #10
)
eval_tensors, _, _, eval_data_layer = create_pipeline(
    num_samples=args.num_eval_samples,
    batch_size=args.batch_size,
    data_prefix=args.eval_file_prefix,
    is_training=False,
    num_gpus=args.num_gpus,
)

# Create callbacks for train and eval modes
train_callback = SimpleLossLoggerCallback(
    tensors=train_tensors,
    print_func=lambda x: logging.info(
        f'Total Loss:{str(round(x[0].item(), 3))}, '
        f'Intent Loss:{str(round(x[1].item(), 3))}, '
        f'Slot Tagging Loss:{str(round(x[2].item(), 3))}'),
    tb_writer=nf.tb_writer,
    get_tb_values=lambda x: [["total_loss", x[0]], ["intent_loss", x[1]],
                             ["slot_loss", x[2]]],
    step_freq=train_steps_per_epoch,
)

eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=eval_tensors,
    user_iter_callback=lambda x, y: eval_iter_callback(x, y),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x,
        intents_label_ids=data_desc.intents_label_ids,
        slots_label_ids=data_desc.slots_label_ids,
        graph_fold=f'{nf.work_dir}/graphs',
        normalize_cm=True,