Example #1
    def test_trim_silence(self):
        batch_size = 4
        normal_dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=True,
            shuffle=False,
        )
        trimmed_dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            trim_silence=True,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=True,
            shuffle=False,
        )
        for norm, trim in zip(normal_dl.data_iterator,
                              trimmed_dl.data_iterator):
            for point in range(batch_size):
                # batch index 1 holds the audio-signal lengths; trimming
                # silence should never make an utterance longer
                self.assertTrue(norm[1][point].data >= trim[1][point].data)
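For reference, `AudioToTextDataLayer` consumes a JSON-lines manifest, one object per utterance. A minimal sketch of building one, assuming the same `audio_filepath`/`duration`/`text` keys that Example #6 below writes (file names here are hypothetical):

import json

# (path, duration in seconds, transcript) -- hypothetical utterances
utterances = [("sample1.wav", 2.4, "hello world"),
              ("sample2.wav", 3.1, "good morning")]

with open("train_manifest.json", "w") as f:
    for path, duration, text in utterances:
        # one JSON object per line, one line per utterance
        f.write(json.dumps({"audio_filepath": path,
                            "duration": duration,
                            "text": text}) + "\n")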
Example #2
    def wav_to_text(self, manifest, greedy=True):

        data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                                   manifest_filepath=manifest,
                                                   labels=self.labels,
                                                   batch_size=1)
        audio_signal, audio_signal_len, transcript, transcript_len = data_layer()
        log_probs, encoded_len = self.asr_model(input_signal=audio_signal,
                                                length=audio_signal_len)
        predictions = self.greedy_decoder(log_probs=log_probs)
        eval_tensors = [predictions]

        if self.ENABLE_NGRAM:
            print('Running with beam search')
            beam_predictions = self.beam_search_with_lm(
                log_probs=log_probs, log_probs_length=encoded_len)
            eval_tensors.append(beam_predictions)

        tensors = self.neural_factory.infer(tensors=eval_tensors)
        if greedy:
            prediction = post_process_predictions(tensors[0], self.labels)
        else:
            # beam-search output is a list of (score, transcript) pairs per
            # sample; the beam results are the last entry in eval_tensors
            prediction = tensors[-1][0][0][0][1]
        del data_layer
        del eval_tensors
        if self.ENABLE_NGRAM:
            del beam_predictions  # only defined when beam search ran
        del predictions
        del tensors
        del audio_signal, audio_signal_len, transcript, transcript_len
        del log_probs, encoded_len
        return prediction
Example #3
        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4,
            )
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'],
            )
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            (
                audio_signal,
                audio_signal_len,
                transcript,
                transcript_len,
            ) = data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
            # Deliberately wrong (hence the function name): the decoder is fed
            # the preprocessed signal instead of the encoder output, so the
            # DAG's port/type checking should reject this connection.
            log_probs = jasper_decoder(encoder_output=processed_signal)
Example #4
    def test_quartznet_model_training(self):
        """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../../examples/asr/configs/jasper_an4.yaml"))
        ) as file:
            model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        model = nemo_asr.models.ASRConvCTCModel(
            preprocessor_params=model_definition[
                'AudioToMelSpectrogramPreprocessor'],
            encoder_params=model_definition['JasperEncoder'],
            decoder_params=model_definition['JasperDecoderForCTC'],
        )
        model.train()
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        log_probs, encoded_len = model(input_signal=audio_signal,
                                       length=a_sig_length)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=partial(self.print_and_log_loss,
                               loss_log_list=loss_list),
            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
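The `self.print_and_log_loss` helper bound into the callback is not shown in the example; a minimal sketch of what such a helper has to do (the exact name and signature in the test suite are assumptions):

import logging

def print_and_log_loss(tensors, loss_log_list=None):
    # tensors[0] is the loss tensor handed over by SimpleLossLoggerCallback;
    # record its value so the test can compare the first and last steps
    loss_value = tensors[0].item()
    logging.info('Train Loss: %f', loss_value)
    if loss_log_list is not None:
        loss_log_list.append(loss_value)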
Example #5
def create_dag(args, cfg, num_gpus):
    # Defining nodes
    data = nemo_asr.TranscriptDataLayer(
        path=args.train_dataset,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        pad_id=cfg['target']['pad_id'],
        batch_size=cfg['optimization']['batch_size'],
        drop_last=True,
    )
    data_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        batch_size=cfg['inference']['batch_size'],
        load_audio=False,
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']), bos_id=cfg['target']['bos_id'], **cfg['DecoderRNN'],
    )
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / batch_size)
    total_steps = num_epochs * steps_per_epoch
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0),],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'], smoothing_coef=cfg['optimization']['smoothing_coef'],
    )
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[decoder], folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq,
    )

    # Creating DAG
    texts, _ = data()
    log_probs, _ = decoder(targets=texts)
    train_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals = []
    _, _, texts, _ = data_eval()
    log_probs, _ = decoder(targets=texts)
    eval_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals.append((args.eval_datasets, (eval_loss, log_probs, texts)))

    # Update config
    cfg['num_params'] = {'decoder': decoder.num_weights}
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (train_loss, evals), cfg, [tf_callback, saver_callback]
Example #6
def recognize_speech():
    if torch.cuda.is_available():
        neural_factory = nemo.core.NeuralModuleFactory(
            placement=nemo.core.DeviceType.GPU,
            optimization_level=nemo.core.Optimization.mxprO1)
    else:
        neural_factory = nemo.core.NeuralModuleFactory(
            placement=nemo.core.DeviceType.CPU)

    # noinspection PyTypeChecker
    asr_model: ASRConvCTCModel = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.am_path)
    asr_model.eval()

    beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
        vocab=asr_model.vocabulary,
        beam_width=args.beam_size,
        alpha=args.alpha,
        beta=args.beta,
        lm_path=args.lm_path,
        num_cpus=max(os.cpu_count() or 1, 1))  # cpu_count() may return None

    # Create dummy manifest with single file
    manifest_path = "manifest.transcription"
    with open(manifest_path, 'w') as f:
        f.write(
            json.dumps({
                "audio_filepath": args.wav_path,
                "duration": 18000,
                "text": "todo"
            }))

    data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                               manifest_filepath=manifest_path,
                                               labels=asr_model.vocabulary,
                                               batch_size=args.batch_size)

    audio_signal, audio_signal_len, _, _ = data_layer()

    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    beam_predictions = beam_search_with_lm(log_probs=log_probs,
                                           log_probs_length=encoded_len)
    eval_tensors = [beam_predictions]

    tensors = neural_factory.infer(tensors=eval_tensors,
                                   use_cache=False,
                                   cache=False,
                                   offload_to_cpu=True)

    batch = tensors[-1][0]
    prediction = batch[0]
    candidates = [candidate[1] for candidate in prediction]

    with open(args.output_path, 'w') as f:
        for candidate in candidates:
            f.write(candidate + "\n")
Example #7
    def test_freeze_unfreeze_TrainableNM(self):
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
        with open(path) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            #'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.mconv.0.conv.weight']))
        frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.detach().cpu().numpy()
        unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.detach().cpu().numpy()
        # jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003},
        )
        new_frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.data
        new_unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.data
        self.assertTrue(np.array_equal(frozen_weight, new_frozen_weight.detach().cpu().numpy()))
        self.assertFalse(np.array_equal(unfrozen_weight, new_unfrozen_weight.detach().cpu().numpy()))
Example #8
    def test_freeze_unfreeze_TrainableNM(self):
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
        jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # print(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 2, "lr": 0.0003},
        )
Example #9
    def test_audio_preprocessors(self):
        batch_size = 5
        dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=True,
            shuffle=False,
        )

        installed_torchaudio = True
        try:
            import torchaudio
        except ModuleNotFoundError:
            installed_torchaudio = False
            with self.assertRaises(ModuleNotFoundError):
                to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(
                    n_fft=400, window=None)
            with self.assertRaises(ModuleNotFoundError):
                to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

        if installed_torchaudio:
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(
                n_fft=400, window=None)
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

        to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)

        for batch in dl.data_iterator:
            input_signals, seq_lengths, _, _ = batch
            input_signals = input_signals.to(to_melspec._device)
            seq_lengths = seq_lengths.to(to_melspec._device)

            melspec = to_melspec.forward(input_signals, seq_lengths)

            if installed_torchaudio:
                spec = to_spectrogram.forward(input_signals, seq_lengths)
                mfcc = to_mfcc.forward(input_signals, seq_lengths)

            # Check that number of features is what we expect
            self.assertTrue(melspec[0].shape[1] == 50)

            if installed_torchaudio:
                self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
                self.assertTrue(mfcc[0].shape[1] == 15)
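The 201-bin expectation in the spectrogram assertion is just the one-sided STFT bin count; a quick check of the arithmetic:

n_fft = 400
assert n_fft // 2 + 1 == 201  # one-sided spectrum has n_fft // 2 + 1 bins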
Example #10
File: routes.py Project: yoks/NeMo
def wav_to_text(manifest, greedy=True):
    from ruamel.yaml import YAML

    yaml = YAML(typ="safe")
    with open(MODEL_YAML) as f:
        jasper_model_definition = yaml.load(f)
    labels = jasper_model_definition['labels']

    # Instantiate necessary neural modules
    data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                               manifest_filepath=manifest,
                                               labels=labels,
                                               batch_size=1)

    # Define inference DAG
    audio_signal, audio_signal_len, _, _ = data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=processed_signal_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)

    eval_tensors = [predictions]  # default to greedy predictions
    if ENABLE_NGRAM and not greedy:
        logging.info('Running with beam search')
        beam_predictions = beam_search_with_lm(log_probs=log_probs,
                                               log_probs_length=encoded_len)
        eval_tensors = [beam_predictions]

    tensors = neural_factory.infer(tensors=eval_tensors)
    if greedy:
        from nemo.collections.asr.helpers import post_process_predictions

        prediction = post_process_predictions(tensors[0], labels)
    else:
        prediction = tensors[0][0][0][0][1]
    return prediction
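`wav_to_text` above leans on module-level globals (`data_preprocessor`, `jasper_encoder`, `jasper_decoder`, `greedy_decoder`, `neural_factory`, `ENABLE_NGRAM`, `MODEL_YAML`) that routes.py defines elsewhere. A sketch of that setup, assuming the same instantiation pattern as the other examples here:

from ruamel.yaml import YAML

# Assumed module-level setup; names and pattern inferred from the examples above.
yaml = YAML(typ="safe")
with open(MODEL_YAML) as f:  # MODEL_YAML path as referenced above
    jasper_model_definition = yaml.load(f)
labels = jasper_model_definition['labels']

neural_factory = nemo.core.NeuralModuleFactory(placement=nemo.core.DeviceType.GPU)
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
jasper_encoder = nemo_asr.JasperEncoder(
    feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
    **jasper_model_definition['JasperEncoder'])
jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels))
greedy_decoder = nemo_asr.GreedyCTCDecoder()
ENABLE_NGRAM = False  # flip to True once beam_search_with_lm is instantiated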
Example #11
    def test_dataloader(self):
        batch_size = 4
        dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=True,
        )
        for ind, data in enumerate(dl.data_iterator):
            # With num_workers update, this is no longer true
            # Moving to GPU is handled by AudioPreprocessor
            # data is on GPU
            # self.assertTrue(data[0].is_cuda)
            # self.assertTrue(data[1].is_cuda)
            # self.assertTrue(data[2].is_cuda)
            # self.assertTrue(data[3].is_cuda)
            # first dimension is batch
            self.assertTrue(data[0].size(0) == batch_size)
            self.assertTrue(data[1].size(0) == batch_size)
            self.assertTrue(data[2].size(0) == batch_size)
            self.assertTrue(data[3].size(0) == batch_size)
Example #12
    def test_clas(self):
        with open('examples/asr/experimental/configs/garnet_an4.yaml') as file:
            cfg = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        encoder = nemo_asr.JasperEncoder(
            jasper=cfg['encoder']['jasper'],
            activation=cfg['encoder']['activation'],
            feat_in=cfg['input']['train']['features'],
        )
        connector = nemo_asr.JasperRNNConnector(
            in_channels=cfg['encoder']['jasper'][-1]['filters'],
            out_channels=cfg['decoder']['hidden_size'],
        )
        decoder = nemo.backends.pytorch.common.DecoderRNN(
            voc_size=len(self.labels),
            bos_id=0,
            hidden_size=cfg['decoder']['hidden_size'],
            attention_method=cfg['decoder']['attention_method'],
            attention_type=cfg['decoder']['attention_type'],
            in_dropout=cfg['decoder']['in_dropout'],
            gru_dropout=cfg['decoder']['gru_dropout'],
            attn_dropout=cfg['decoder']['attn_dropout'],
            teacher_forcing=cfg['decoder']['teacher_forcing'],
            curriculum_learning=cfg['decoder']['curriculum_learning'],
            rnn_type=cfg['decoder']['rnn_type'],
            n_layers=cfg['decoder']['n_layers'],
            tie_emb_out_weights=cfg['decoder']['tie_emb_out_weights'],
        )
        loss = nemo.backends.pytorch.common.SequenceLoss()

        # DAG
        audio_signal, a_sig_length, transcripts, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)
        encoded, encoded_len = encoder(audio_signal=processed_signal,
                                       length=p_length)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = loss(log_probs=log_probs, targets=transcripts)

        # Train
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example #13
    def test_stft_conv(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example #14
    def test_tacotron2_training(self):
        """Integtaion test that instantiates a smaller Tacotron2 model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            log_zero_guard_type="clamp",
            log_zero_guard_value=1e-05,
        )
        text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
        t2_enc = nemo_tts.Tacotron2Encoder(encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256)
        t2_dec = nemo_tts.Tacotron2Decoder(
            n_mel_channels=64,
            n_frames_per_step=1,
            encoder_embedding_dim=256,
            gate_threshold=0.5,
            prenet_dim=128,
            max_decoder_steps=1000,
            decoder_rnn_dim=512,
            p_decoder_dropout=0.1,
            p_attention_dropout=0.1,
            attention_rnn_dim=512,
            attention_dim=64,
            attention_location_n_filters=16,
            attention_location_kernel_size=15,
        )
        t2_postnet = nemo_tts.Tacotron2Postnet(
            n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3
        )
        t2_loss = nemo_tts.Tacotron2Loss()
        makegatetarget = nemo_tts.MakeGate()

        # DAG
        audio, audio_len, transcript, transcript_len = data_layer()
        spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)

        transcript_embedded = text_embedding(char_phone=transcript)
        transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
        mel_decoder, gate, _ = t2_dec(
            char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target
        )
        mel_postnet = t2_postnet(mel_input=mel_decoder)
        gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
        loss_t = t2_loss(
            mel_out=mel_decoder,
            mel_out_postnet=mel_postnet,
            gate_out=gate,
            mel_target=spec_target,
            gate_target=gate_target,
            target_len=spec_target_len,
            seq_len=audio_len,
        )
        loss_list = []

        callback = SimpleLossLoggerCallback(
            tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = PtActions()
        optimizer.train(
            [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
        )

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example #15
def create_all_dags(args, neural_factory):
    '''
    Creates the train and eval DAGs as well as their callbacks.
    Returns the train loss tensor, callbacks, and steps per epoch.
    '''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    try:
        vocab = quartz_params['labels']
        sample_rate = quartz_params['sample_rate']
    except KeyError:
        logging.error("Please make sure you are using older config format (the ones with -old suffix)")
        exit(1)

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"], **quartz_params["JasperEncoder"],
    )

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
    )

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present

    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # assemble train DAG

    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t,
    )

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq,
        )

        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
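A sketch of how `create_all_dags` is typically wired into the surrounding training script (this `main()` is an assumption that follows the patterns of the other examples; `parse_args` and the argument names are hypothetical):

def main():
    args = parse_args()  # hypothetical argument parsing, as in Example #20
    neural_factory = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank, optimization_level=args.amp_opt_level)
    train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory)
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
    )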
Example #16
    def test_contextnet_ctc_training(self):
        """Integtaion test that instantiates a small ContextNet model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        Checks SE-block with fixed context size and global context, residual_mode='stride_add' and 'stride_last' flags
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/contextnet_32.yaml"))) as f:
            contextnet_model_definition = self.yaml.load(f)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 80,
            'window_size': 0.025,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)

        spec_aug = nemo_asr.SpectrogramAugmentation(
            **contextnet_model_definition['SpectrogramAugmentation'])

        contextnet_encoder = nemo_asr.ContextNetEncoder(
            feat_in=contextnet_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **contextnet_model_definition['ContextNetEncoder'],
        )
        contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(feat_in=32,
                                                              hidden_size=16,
                                                              num_classes=len(
                                                                  self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        processed_signal = spec_aug(input_spec=processed_signal)

        encoded, encoded_len = contextnet_encoder(
            audio_signal=processed_signal, length=p_length)
        log_probs = contextnet_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example #17
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)
    # store results
    parser.add_argument("--save_logprob", default=None, type=str)

    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        help='maximum value of word count weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)

    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0

        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []

        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Inferring with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    num_cpus=max(os.cpu_count() or 1, 1),  # cpu_count() may return None
                    input_tensor=False,
                )

                beam_predictions = beam_search_with_lm(log_probs=logprobexp,
                                                       log_probs_length=None,
                                                       force_pt=True)

                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
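The `args.alpha_max += args.alpha_step / 10.0` nudge above is there because `np.arange` excludes its upper endpoint; a minimal illustration:

import numpy as np

assert 2.0 not in np.arange(1.0, 2.0, 0.5)           # endpoint excluded
assert 2.0 in np.arange(1.0, 2.0 + 0.5 / 10.0, 0.5)  # nudged endpoint included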
Example #18
    def test_clas(self):
        with open('examples/asr/experimental/configs/garnet_an4.yaml') as file:
            cfg = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        encoder = nemo_asr.JasperEncoder(
            jasper=cfg['encoder']['jasper'],
            activation=cfg['encoder']['activation'],
            feat_in=cfg['input']['train']['features'],
        )
        connector = nemo_asr.JasperRNNConnector(
            in_channels=cfg['encoder']['jasper'][-1]['filters'],
            out_channels=cfg['decoder']['hidden_size'],
        )
        decoder = nemo.backends.pytorch.common.DecoderRNN(
            voc_size=len(self.labels),
            bos_id=0,
            **cfg['decoder']  # fictive
        )
        loss = nemo.backends.pytorch.common.SequenceLoss()

        # DAG
        audio_signal, a_sig_length, transcripts, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)
        encoded, encoded_len = encoder(audio_signal=processed_signal,
                                       length=p_length)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = loss(log_probs=log_probs, targets=transcripts)

        # Train
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: print(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example #19
    def test_tacotron2_training(self):
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
        )
        text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
        t2_enc = nemo_tts.Tacotron2Encoder(
            encoder_n_convolutions=2,
            encoder_kernel_size=5,
            encoder_embedding_dim=256,
        )
        t2_dec = nemo_tts.Tacotron2Decoder(
            n_mel_channels=64,
            n_frames_per_step=1,
            encoder_embedding_dim=256,
            gate_threshold=0.5,
            prenet_dim=128,
            max_decoder_steps=1000,
            decoder_rnn_dim=512,
            p_decoder_dropout=0.1,
            p_attention_dropout=0.1,
            attention_rnn_dim=512,
            attention_dim=64,
            attention_location_n_filters=16,
            attention_location_kernel_size=15,
        )
        t2_postnet = nemo_tts.Tacotron2Postnet(
            n_mel_channels=64,
            postnet_embedding_dim=256,
            postnet_kernel_size=5,
            postnet_n_convolutions=3,
        )
        t2_loss = nemo_tts.Tacotron2Loss()
        makegatetarget = nemo_tts.MakeGate()

        # DAG
        audio, audio_len, transcript, transcript_len = data_layer()
        spec_target, spec_target_len = preprocessing(input_signal=audio,
                                                     length=audio_len)

        transcript_embedded = text_embedding(char_phone=transcript)
        transcript_encoded = t2_enc(
            char_phone_embeddings=transcript_embedded,
            embedding_length=transcript_len,
        )
        mel_decoder, gate, _ = t2_dec(
            char_phone_encoded=transcript_encoded,
            encoded_length=transcript_len,
            mel_target=spec_target,
        )
        mel_postnet = t2_postnet(mel_input=mel_decoder)
        gate_target = makegatetarget(mel_target=spec_target,
                                     target_len=spec_target_len)
        loss_t = t2_loss(
            mel_out=mel_decoder,
            mel_out_postnet=mel_postnet,
            gate_out=gate,
            mel_target=spec_target,
            gate_target=gate_target,
            target_len=spec_target_len,
            seq_len=audio_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss_t],
            print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        optimizer = nemo.backends.pytorch.actions.PtActions()
        optimizer.train(
            [loss_t],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example #20
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass: 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En' to train from pre-trained models. To train from scratch pass path to modelfile ending with .yaml.",
    )
    parser.add_argument(
        "--amp_opt_level",
        default="O0",
        type=str,
        choices=["O0", "O1", "O2", "O3"],
        help="See: https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--train_dataset",
                        type=str,
                        required=True,
                        default=None,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs="*",
                        help="evaluation datasets paths")
    parser.add_argument("--eval_freq",
                        default=1000,
                        type=int,
                        help="Evaluation frequency")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=8,
                        help="batch size to use for evaluation")
    parser.add_argument("--local_rank",
                        default=None,
                        type=int,
                        help="node rank for distributed training")
    parser.add_argument("--stats_freq",
                        default=25,
                        type=int,
                        help="frequency with which to update train stats")
    parser.add_argument("--checkpoint_dir",
                        default=None,
                        type=str,
                        help="Folder where to save checkpoints")
    parser.add_argument("--checkpoint_save_freq",
                        required=False,
                        type=int,
                        help="how often to checkpoint")
    parser.add_argument("--optimizer", default="novograd", type=str)
    parser.add_argument("--warmup_ratio",
                        default=0.02,
                        type=float,
                        help="learning rate warmup ratio")
    parser.add_argument("--batch_size",
                        required=True,
                        type=int,
                        help="train batch size per GPU")
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        help="number of epochs to train")
    parser.add_argument("--lr", default=0.01, type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--weight_decay", default=0.001, type=float)
    parser.add_argument("--iter_per_step",
                        default=1,
                        type=int,
                        help="number of grad accumulations per batch")
    parser.add_argument("--wandb_exp_name", default=None, type=str)
    parser.add_argument("--wandb_project", default=None, type=str)
    parser.add_argument("--max_train_audio_len",
                        default=16.7,
                        type=float,
                        help="max audio length")
    parser.add_argument("--do_not_trim_silence",
                        action="store_false",
                        help="Add this flag to disable silence trimming")
    parser.add_argument("--do_not_normalize_text",
                        action="store_false",
                        help="Add this flag to set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,  # necessary for distributed training
        optimization_level=args.amp_opt_level,  # necessary for mixed precision
        cudnn_benchmark=True,
    )

    # Instantiate the model which we'll train
    if args.asr_model.endswith('.yaml'):
        logging.info(
            f"Speech2Text: Will train from scratch using config from {args.asr_model}"
        )
        asr_model = nemo_asr.models.ASRConvCTCModel.import_from_config(
            args.asr_model)
    else:
        logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
        asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
            model_info=args.asr_model, local_rank=args.local_rank)

    if args.asr_model.strip().endswith('-Zh'):
        logging.info('USING CER')
        eval_metric = 'CER'
    else:
        eval_metric = 'WER'

    logging.info("\n\n")
    logging.info(f"Speech2Text: Training on {nf.world_size} GPUs.")
    logging.info(f"Training {type(asr_model)} model.")
    logging.info(f"Training CTC model with alphabet {asr_model.vocabulary}.")
    logging.info(
        f"Training CTC model with {asr_model.num_weights} weights.\n\n")

    train_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=asr_model.vocabulary,
        batch_size=args.batch_size,
        trim_silence=args.do_not_trim_silence,
        max_duration=args.max_train_audio_len,
        shuffle=True,
        normalize_transcripts=args.do_not_normalize_text,
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(asr_model.vocabulary))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    audio_signal, audio_signal_len, transcript, transcript_len = train_data_layer()
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Callbacks which we'll be using:
    callbacks = []
    # SimpleLossLogger prints basic training stats (e.g. loss) to console
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        step_freq=args.stats_freq,
        print_func=partial(monitor_asr_train_progress,
                           labels=asr_model.vocabulary,
                           eval_metric=eval_metric),
    )
    callbacks.append(train_callback)
    if args.checkpoint_dir is not None and args.checkpoint_save_freq is not None:
        # Checkpoint callback saves checkpoints periodically
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq)
        callbacks.append(checkpointer_callback)

    if args.wandb_exp_name is not None and args.wandb_project is not None:
        # WandbCallback saves stats to Weights&Biases
        wandb_callback = nemo.core.WandBLogger(
            step_freq=args.stats_freq,
            wandb_name=args.wandb_exp_name,
            wandb_project=args.wandb_project,
            args=args)
        callbacks.append(wandb_callback)

    # Evaluation
    if args.eval_datasets is not None and args.eval_freq is not None:
        asr_model.eval()  # switch model to evaluation mode
        logging.info(f"Will perform evaluation every {args.eval_freq} steps.")
        for ind, eval_dataset in enumerate(args.eval_datasets):
            eval_data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                labels=asr_model.vocabulary,
                batch_size=args.eval_batch_size,
                normalize_transcripts=args.do_not_normalize_text,
            )
            audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer()
            log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                               length=audio_signal_len)
            eval_predictions = greedy_decoder(log_probs=log_probs)
            eval_loss = ctc_loss(log_probs=log_probs,
                                 targets=transcript,
                                 input_length=encoded_len,
                                 target_length=transcript_len)
            tag_name = os.path.basename(eval_dataset).split(".")[0]
            eval_callback = nemo.core.EvaluatorCallback(
                eval_tensors=[
                    eval_loss, eval_predictions, transcript, transcript_len
                ],
                user_iter_callback=partial(process_evaluation_batch,
                                           labels=asr_model.vocabulary),
                user_epochs_done_callback=partial(process_evaluation_epoch,
                                                  tag=tag_name,
                                                  eval_metric=eval_metric),
                eval_step=args.eval_freq,
                wandb_name=args.wandb_exp_name,
                wandb_project=args.wandb_project,
            )
            callbacks.append(eval_callback)

    # math.ceil keeps the step count an integer; plain division yields a float
    steps_in_epoch = math.ceil(
        len(train_data_layer) /
        (args.batch_size * args.iter_per_step * nf.world_size))
    lr_policy = CosineAnnealing(total_steps=args.num_epochs * steps_in_epoch,
                                warmup_ratio=args.warmup_ratio)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
        },
        batches_per_step=args.iter_per_step,
        lr_policy=lr_policy,
    )
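For reference, the CosineAnnealing policy above derives its warmup length from warmup_ratio. Below is a minimal sketch of that arithmetic in plain Python; the sample count and GPU count are hypothetical placeholders, and the ratio-to-steps conversion reflects the usual meaning of a warmup ratio rather than a quote of NeMo's implementation.

import math

num_samples = 120_000  # hypothetical manifest size
batch_size = 32        # --batch_size
iter_per_step = 1      # --iter_per_step
world_size = 8         # number of GPUs
num_epochs = 5         # --num_epochs
warmup_ratio = 0.02    # --warmup_ratio

# mirrors steps_in_epoch above: optimizer steps per pass over the data
steps_in_epoch = math.ceil(
    num_samples / (batch_size * iter_per_step * world_size))
total_steps = num_epochs * steps_in_epoch
warmup_steps = int(warmup_ratio * total_steps)  # linear warmup, then cosine decay
print(steps_in_epoch, total_steps, warmup_steps)  # 469 2345 46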
Example No. 21
def create_dags(jasper_params, args, nf):
    vocab = jasper_params['labels']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params,
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    logging.info("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params,
    )

    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        vocab,
        log_probs_e,
        encoded_len_e,
    )
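create_dags only assembles the graph; here is a minimal sketch of consuming its return value, assuming a NeuralModuleFactory nf, parsed args, and jasper_params loaded from YAML as in the neighboring examples.

loss, eval_tensors, callbacks, total_steps, vocab, _, _ = create_dags(
    jasper_params, args, nf)
nf.train(
    tensors_to_optimize=[loss],
    callbacks=callbacks,
    optimizer="novograd",
    optimization_params={"max_steps": total_steps, "lr": args.lr},
)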
Example No. 22
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="batch size to use for evaluation")
    parser.add_argument("--wer_target",
                        type=float,
                        default=None,
                        help="used by test")
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim_silence",
                        default=True,
                        type=bool,
                        help="trim audio from silence or not")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory()

    # Instantiate the model which we'll train
    logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
    asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.asr_model)
    asr_model.eval()

    logging.info("\n\n")
    logging.info(f"Evaluation using {type(asr_model)} model.")
    logging.info(f"Evaluation using alphabet {asr_model.vocabulary}.")
    logging.info(f"The model has {asr_model.num_weights} weights.\n\n")

    eval_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.dataset,
        labels=asr_model.vocabulary,
        batch_size=args.eval_batch_size,
        trim_silence=args.trim_silence,
        shuffle=False,
        normalize_transcripts=args.normalize_text,
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer()
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)

    # inference
    eval_tensors = [
        log_probs, predictions, transcript, transcript_len, encoded_len
    ]
    evaluated_tensors = nf.infer(tensors=eval_tensors)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                 asr_model.vocabulary)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3],
                                          asr_model.vocabulary)

    if args.asr_model.strip().endswith('-Zh'):
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=True)
        metric = 'CER'
    else:
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=False)
        metric = 'WER'
    logging.info(f"Greedy {metric} = {val}")
    if args.wer_target is not None:
        if args.wer_target * args.wer_tolerance < val:
            raise ValueError(
                f"Resulting {metric} {val} is higher than the target {args.wer_target}"
            )
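word_error_rate above is token-level Levenshtein distance divided by total reference length (words for WER, characters for CER). A self-contained illustration of that metric follows; it is illustrative only, not NeMo's implementation.

def edit_distance(ref, hyp):
    # single-row dynamic-programming Levenshtein distance over token lists
    d = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, d[0] = d[0], i
        for j, h in enumerate(hyp, 1):
            prev, d[j] = d[j], min(d[j] + 1, d[j - 1] + 1, prev + (r != h))
    return d[-1]

def simple_wer(hypotheses, references, use_cer=False):
    errors, total = 0, 0
    for hyp, ref in zip(hypotheses, references):
        hyp_t = list(hyp) if use_cer else hyp.split()
        ref_t = list(ref) if use_cer else ref.split()
        errors += edit_distance(ref_t, hyp_t)
        total += len(ref_t)
    return errors / total

print(simple_wer(["the cat sat"], ["the cat sat down"]))  # 0.25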
Example No. 23
    def test_stft_conv_training(self):
        """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=1024, num_classes=len(self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example No. 24
    def test_jasper_eval(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )
        predictions = greedy_decoder(log_probs=log_probs)

        from nemo.collections.asr.helpers import (
            process_evaluation_batch,
            process_evaluation_epoch,
        )

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss, predictions, transcript, transcript_len],
            user_iter_callback=lambda x, y: process_evaluation_batch(
                x, y, labels=self.labels),
            user_epochs_done_callback=process_evaluation_epoch,
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.eval(callbacks=[eval_callback])
Example No. 25
def create_dag(args, cfg, logger, num_gpus):
    # Defining nodes
    data = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=cfg['target']['labels'],
        batch_size=cfg['optimization']['batch_size'],
        eos_id=cfg['target']['eos_id'],
        **cfg['AudioToTextDataLayer']['train'],
    )
    data_evals = []
    if args.eval_datasets:
        for val_path in args.eval_datasets:
            data_evals.append(
                nemo_asr.AudioToTextDataLayer(
                    manifest_filepath=val_path,
                    labels=cfg['target']['labels'],
                    batch_size=cfg['inference']['batch_size'],
                    eos_id=cfg['target']['eos_id'],
                    **cfg['AudioToTextDataLayer']['eval'],
                ))
    else:
        logger.info("There were no val datasets passed")
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg['AudioToMelSpectrogramPreprocessor'])
    data_augmentation = nemo_asr.SpectrogramAugmentation(
        **cfg['SpectrogramAugmentation'])
    encoder = nemo_asr.JasperEncoder(
        feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"],
        **cfg['JasperEncoder'],
    )
    if args.encoder_checkpoint is not None and os.path.exists(
            args.encoder_checkpoint):
        if cfg['JasperEncoder']['load']:
            encoder.restore_from(args.encoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for encoder'
                        f' from {args.encoder_checkpoint}')
        if cfg['JasperEncoder']['freeze']:
            encoder.freeze()
            logger.info(f'Freeze encoder weights')
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'],
        out_channels=cfg['DecoderRNN']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )
    if args.decoder_checkpoint is not None and os.path.exists(
            args.decoder_checkpoint):
        if cfg['DecoderRNN']['load']:
            decoder.restore_from(args.decoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for decoder'
                        f' from {args.decoder_checkpoint}')
        if cfg['DecoderRNN']['freeze']:
            decoder.freeze()
            logger.info('Froze decoder weights')
            # the rest of this block reads cfg['DecoderRNN'], so the same
            # section is used here rather than a separate cfg['decoder'] key
            if cfg['DecoderRNN']['unfreeze_attn']:
                for name, param in decoder.attention.named_parameters():
                    param.requires_grad = True
                logger.info('Unfroze decoder attention weights')
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / (batch_size * num_gpus))
    total_steps = num_epochs * steps_per_epoch
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
        sample_wise=cfg['optimization']['sample_wise'],
    )
    se_callback = ValueSetterCallback(
        seq_loss,
        'smoothing_coef',
        policies=[
            vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef),
                       start=0.0,
                       end=1.0),
        ],
        total_steps=total_steps,
    )
    beam_search = nemo.backends.pytorch.BeamSearch(
        decoder=decoder,
        pad_id=cfg['target']['pad_id'],
        bos_id=cfg['target']['bos_id'],
        eos_id=cfg['target']['eos_id'],
        max_len=cfg['target']['max_len'],
        beam_size=cfg['inference']['beam_size'],
    )
    uf_callback = UnfreezeCallback(
        [encoder, decoder], start_epoch=cfg['optimization']['start_unfreeze'])
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[encoder, connector, decoder],
        folder=args.checkpoint_dir,
        step_freq=args.eval_freq,
    )

    # Creating DAG
    audios, audio_lens, transcripts, _ = data()
    processed_audios, processed_audio_lens = data_preprocessor(
        input_signal=audios, length=audio_lens)
    augmented_spec = data_augmentation(input_spec=processed_audios)
    encoded, _ = encoder(audio_signal=augmented_spec,
                         length=processed_audio_lens)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    train_loss = seq_loss(log_probs=log_probs, targets=transcripts)
    evals = []
    for i, data_eval in enumerate(data_evals):
        audios, audio_lens, transcripts, _ = data_eval()
        processed_audios, processed_audio_lens = data_preprocessor(
            input_signal=audios, length=audio_lens)
        encoded, _ = encoder(audio_signal=processed_audios,
                             length=processed_audio_lens)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = seq_loss(log_probs=log_probs, targets=transcripts)
        predictions, aw = beam_search(encoder_outputs=encoded)
        evals.append((
            args.eval_datasets[i],
            (loss, log_probs, transcripts, predictions, aw),
        ))

    # Update config
    cfg['num_params'] = {
        'encoder': encoder.num_weights,
        'connector': connector.num_weights,
        'decoder': decoder.num_weights,
    }
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (
        (train_loss, evals),
        cfg,
        [tf_callback, se_callback, uf_callback, saver_callback],
    )
Example No. 26
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = load_vocab(args.vocab_file)

    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            num_cpus=max(os.cpu_count() or 1, 1),  # os.cpu_count() may return None
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        logging.info("Beam CER {:.2f}".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
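The pickle written via --save_logprob holds one [time, num_classes] array per utterance. Below is a minimal sketch of loading it back and greedily decoding with numpy; the file name is a hypothetical placeholder, and the blank index is assumed to be the last class, as is conventional for these CTC models.

import pickle

import numpy as np

with open('eval_logprobs.pkl', 'rb') as f:  # path passed via --save_logprob
    logprob = pickle.load(f)                # list of [time, num_classes] arrays

blank_id = logprob[0].shape[1] - 1          # assumed: CTC blank is the last class
for lp in logprob[:3]:
    ids = lp.argmax(axis=1)
    # standard greedy CTC post-processing: collapse repeats, then drop blanks
    collapsed = [int(i) for k, i in enumerate(ids) if k == 0 or i != ids[k - 1]]
    token_ids = [i for i in collapsed if i != blank_id]
    print(token_ids)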
Example No. 27
    def test_fastspeech(self):
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )

        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=1,
            shuffle=False,
            sample_rate=16000,
        )

        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            window_size=None,
            window_stride=None,
            n_window_size=512,
            n_window_stride=128,
            normalize=None,
            preemph=None,
            dither=0,
            mag_power=1.0,
            pad_value=-11.52,
            pad_to=0,
        )

        data = data_layer()
        spec, spec_length = data_preprocessor(input_signal=data.audio_signal,
                                              length=data.a_sig_length)

        # Creates and saves durations as numpy arrays.
        durs_dir = pathlib.Path('tests/data/asr/durs')
        durs_dir.mkdir(exist_ok=True)
        result = neural_factory.infer(
            [data.transcripts, data.transcript_length, spec_length, spec])
        k = -1
        for text, text_len, mel_len, mel in zip(result[0], result[1],
                                                result[2], result[3]):
            text = text.cpu().numpy()[0][:text_len.cpu().numpy()[0]]
            dur = np.zeros(text.shape[0], dtype=np.int64)  # np.long is deprecated; int64 matches it here
            dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
            dur[0] = dur_sum - 4
            dur[1] = 4
            k += 1
            np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

        data_layer = nemo_tts.FastSpeechDataLayer(
            manifest_filepath=self.manifest_filepath,
            durs_dir=durs_dir,
            labels=self.labels,
            batch_size=4,
            sample_rate=16000,
        )

        fastspeech = nemo_tts.FastSpeech(
            decoder_output_size=384,
            n_mels=64,
            max_seq_len=2048,
            word_vec_dim=384,
            encoder_n_layer=6,
            encoder_head=2,
            encoder_conv1d_filter_size=1536,
            decoder_n_layer=6,
            decoder_head=2,
            decoder_conv1d_filter_size=1536,
            fft_conv1d_kernel=3,
            fft_conv1d_padding=1,
            encoder_output_size=384,
            duration_predictor_filter_size=256,
            duration_predictor_kernel_size=3,
            dropout=0.1,
            alpha=1.0,
            n_src_vocab=len(self.labels),
            pad_id=0,
        )

        loss = nemo_tts.FastSpeechLoss()

        data = data_layer()
        mel_true, _ = data_preprocessor(input_signal=data.audio,
                                        length=data.audio_len)
        mel_pred, dur_pred = fastspeech(
            text=data.text,
            text_pos=data.text_pos,
            mel_true=mel_true,
            dur_true=data.dur_true,
        )
        loss_t = loss(
            mel_true=mel_true,
            mel_pred=mel_pred,
            dur_true=data.dur_true,
            dur_pred=dur_pred,
            text_pos=data.text_pos,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss_t],
            print_func=lambda x: logging.info(f'Train Loss: {x[0].item()}'),
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss_t],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 3,
                "lr": 0.0003
            },
        )
Example No. 28
def create_train_dag(
    neural_factory,
    neural_modules,
    tacotron2_params,
    train_dataset,
    batch_size,
    log_freq,
    checkpoint_save_freq,
    cpu_per_dl=1,
):
    (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss,
     makegatetarget) = neural_modules

    train_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"])
    train_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=train_dataset,
        labels=tacotron2_params['labels'],
        bos_id=len(tacotron2_params['labels']),
        eos_id=len(tacotron2_params['labels']) + 1,
        pad_id=len(tacotron2_params['labels']) + 2,
        batch_size=batch_size,
        num_workers=cpu_per_dl,
        **train_dl_params,
    )

    N = len(data_layer)
    steps_per_epoch = math.ceil(N / (batch_size * neural_factory.world_size))
    logging.info(f'Have {N} examples to train on.')

    # Train DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = data_preprocessor(input_signal=audio,
                                                     length=audio_len)

    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded,
                                embedding_length=transcript_len)
    mel_decoder, gate, alignments = t2_dec(
        char_phone_encoded=transcript_encoded,
        encoded_length=transcript_len,
        mel_target=spec_target,
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target,
                                 target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[
            loss_t, spec_target, mel_postnet, gate, gate_target, alignments
        ],
        print_func=lambda x: logging.info(f"Loss: {x[0].data}"),
        log_to_tb_func=partial(tacotron2_log_to_tb_func,
                               log_images=True,
                               log_images_freq=log_freq),
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, step_freq=checkpoint_save_freq)

    callbacks = [train_callback, chpt_callback]
    return loss_t, callbacks, steps_per_epoch
Example No. 29
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    nemo.logging.info('================================')

    # Train DAG
    (
        audio_signal_t,
        a_sig_length_t,
        transcript_t,
        transcript_len_t,
    ) = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (
            processed_signal_t,
            p_length_t,
            transcript_t,
            transcript_len_t,
        ) = multiply_batch(
            in_x=processed_signal_t,
            in_x_len=p_length_t,
            in_y=transcript_t,
            in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t,
                                              length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress,
                           labels=vocab,
                           eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (
            audio_signal_e,
            a_sig_length_e,
            transcript_e,
            transcript_len_e,
        ) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e,
                predictions_e,
                transcript_e,
                transcript_len_e,
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              eval_metric='CER',
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
Example No. 30
def create_eval_dags(
    neural_factory,
    neural_modules,
    tacotron2_params,
    eval_datasets,
    eval_batch_size,
    eval_freq,
    cpu_per_dl=1,
):
    (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss,
     makegatetarget) = neural_modules

    eval_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"])
    eval_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    callbacks = []
    # assemble eval DAGs
    for eval_dataset in eval_datasets:
        data_layer_eval = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=eval_dataset,
            labels=tacotron2_params['labels'],
            bos_id=len(tacotron2_params['labels']),
            eos_id=len(tacotron2_params['labels']) + 1,
            pad_id=len(tacotron2_params['labels']) + 2,
            batch_size=eval_batch_size,
            num_workers=cpu_per_dl,
            **eval_dl_params,
        )

        audio, audio_len, transcript, transcript_len = data_layer_eval()
        spec_target, spec_target_len = data_preprocessor(input_signal=audio,
                                                         length=audio_len)

        transcript_embedded = text_embedding(char_phone=transcript)
        transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded,
                                    embedding_length=transcript_len)
        mel_decoder, gate, alignments = t2_dec(
            char_phone_encoded=transcript_encoded,
            encoded_length=transcript_len,
            mel_target=spec_target,
        )
        mel_postnet = t2_postnet(mel_input=mel_decoder)
        gate_target = makegatetarget(mel_target=spec_target,
                                     target_len=spec_target_len)
        loss = t2_loss(
            mel_out=mel_decoder,
            mel_out_postnet=mel_postnet,
            gate_out=gate,
            mel_target=spec_target,
            gate_target=gate_target,
            target_len=spec_target_len,
            seq_len=audio_len,
        )

        # create corresponding eval callback
        tagname = os.path.basename(eval_dataset).split(".")[0]
        eval_tensors = [
            loss,
            spec_target,
            mel_postnet,
            gate,
            gate_target,
            alignments,
        ]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=eval_tensors,
            user_iter_callback=tacotron2_process_eval_batch,
            user_epochs_done_callback=partial(tacotron2_process_final_eval,
                                              tag=tagname),
            tb_writer_func=partial(tacotron2_eval_log_to_tb_func, tag=tagname),
            eval_step=eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return callbacks
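Below is a minimal sketch of how these Tacotron 2 helpers compose, assuming neural_factory, neural_modules, and tacotron2_params built as in the surrounding examples; the manifest paths, frequencies, and optimizer settings are hypothetical placeholders.

train_loss, train_callbacks, steps_per_epoch = create_train_dag(
    neural_factory,
    neural_modules,
    tacotron2_params,
    train_dataset='train_manifest.json',
    batch_size=32,
    log_freq=200,
    checkpoint_save_freq=1000,
)
eval_callbacks = create_eval_dags(
    neural_factory,
    neural_modules,
    tacotron2_params,
    eval_datasets=['val_manifest.json'],
    eval_batch_size=16,
    eval_freq=500,
)
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=train_callbacks + eval_callbacks,
    optimizer='adam',
    optimization_params={'num_epochs': 10, 'lr': 1e-3},
)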