Exemple #1
0
        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4,
            )
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'],
            )
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            (
                audio_signal,
                audio_signal_len,
                transcript,
                transcript_len,
            ) = data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
            log_probs = jasper_decoder(encoder_output=processed_signal)
Exemple #2
0
 def test_jasper_decoder(self):
     j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=33)
     self.__test_export_route_all(
         module=j_decoder,
         out_name="j_decoder",
         input_example=torch.randn(34, 1024, 1).cuda(),
     )
Exemple #3
0
 def test_jasper_decoder_export_ts(self):
     j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=33)
     self.__test_export_route(
         module=j_decoder,
         out_name="j_decoder.ts",
         mode=nemo.core.DeploymentFormat.TORCHSCRIPT,
         input_example=None,
     )
Exemple #4
0
    def test_freeze_unfreeze_TrainableNM(self):
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
        with open(path) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            #'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.mconv.0.conv.weight']))
        frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.detach().cpu().numpy()
        unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.detach().cpu().numpy()
        # jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003},
        )
        new_frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.data
        new_unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.data
        self.assertTrue(np.array_equal(frozen_weight, new_frozen_weight.detach().cpu().numpy()))
        self.assertFalse(np.array_equal(unfrozen_weight, new_unfrozen_weight.detach().cpu().numpy()))
Exemple #5
0
    def test_asr_with_zero_ds(self):
        logging.info("Testing ASR NMs with ZeroDS and without pre-processing")
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
        with open(path) as file:
            jasper_model_definition = self.yaml.load(file)

        dl = nemo.backends.pytorch.common.ZerosDataLayer(
            size=100,
            dtype=torch.FloatTensor,
            batch_size=4,
            output_ports={
                # "processed_signal": NeuralType(
                #    {
                #        0: AxisType(BatchTag),
                #        1: AxisType(SpectrogramSignalTag, dim=64),
                #        2: AxisType(ProcessedTimeTag, dim=64),
                #    }
                # ),
                # "processed_length": NeuralType({0: AxisType(BatchTag)}),
                # "transcript": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag, dim=64)}),
                # "transcript_length": NeuralType({0: AxisType(BatchTag)}),
                "processed_signal": NeuralType(
                    (AxisType(AxisKind.Batch), AxisType(AxisKind.Dimension, 64), AxisType(AxisKind.Time, 64)),
                    SpectrogramType(),
                ),
                "processed_length": NeuralType(tuple('B'), LengthsType()),
                "transcript": NeuralType((AxisType(AxisKind.Batch), AxisType(AxisKind.Time, 64)), LabelsType()),
                "transcript_length": NeuralType(tuple('B'), LengthsType()),
            },
        )

        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition["JasperEncoder"],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        processed_signal, p_length, transcript, transcript_len = dl()
        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.train(
            [loss], callbacks=[callback], optimization_params={"num_epochs": 2, "lr": 0.0003}, optimizer="sgd",
        )
Exemple #6
0
    def test_freeze_unfreeze_TrainableNM(self):
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
        jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # print(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 2, "lr": 0.0003},
        )
Exemple #7
0
 def __init__(self,
              model_yaml,
              encoder_checkpoint,
              decoder_checkpoint,
              language_model=None):
     super(JasperASR, self).__init__()
     # Read model YAML
     yaml = YAML(typ="safe")
     with open(model_yaml) as f:
         jasper_model_definition = yaml.load(f)
     self.neural_factory = nemo.core.NeuralModuleFactory(
         placement=nemo.core.DeviceType.GPU,
         backend=nemo.core.Backend.PyTorch)
     self.labels = jasper_model_definition["labels"]
     self.data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor()
     self.jasper_encoder = nemo_asr.JasperEncoder(
         jasper=jasper_model_definition["JasperEncoder"]["jasper"],
         activation=jasper_model_definition["JasperEncoder"]["activation"],
         feat_in=jasper_model_definition[
             "AudioToMelSpectrogramPreprocessor"]["features"],
     )
     self.jasper_encoder.restore_from(encoder_checkpoint, local_rank=0)
     self.jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                        num_classes=len(
                                                            self.labels))
     self.jasper_decoder.restore_from(decoder_checkpoint, local_rank=0)
     self.greedy_decoder = nemo_asr.GreedyCTCDecoder()
     self.beam_search_with_lm = None
     if language_model:
         self.beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
             vocab=self.labels,
             beam_width=64,
             alpha=2.0,
             beta=1.0,
             lm_path=language_model,
             num_cpus=max(os.cpu_count(), 1),
         )
Exemple #8
0
    def test_jasper_eval(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )
        predictions = greedy_decoder(log_probs=log_probs)

        from nemo.collections.asr.helpers import (
            process_evaluation_batch,
            process_evaluation_epoch,
        )

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss, predictions, transcript, transcript_len],
            user_iter_callback=lambda x, y: process_evaluation_batch(
                x, y, labels=self.labels),
            user_epochs_done_callback=process_evaluation_epoch,
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.eval(callbacks=[eval_callback])
Exemple #9
0
    def test_stft_conv(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Exemple #10
0
def create_all_dags(args, neural_factory):
    '''
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks'''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    try:
        vocab = quartz_params['labels']
        sample_rate = quartz_params['sample_rate']
    except KeyError:
        logging.error("Please make sure you are using older config format (the ones with -old suffix)")
        exit(1)

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"], **quartz_params["JasperEncoder"],
    )

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
    )

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present

    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # assemble train DAG

    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t,
    )

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq,
        )

        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
Exemple #11
0
    def __init__(self, model_definition):
        self.model_definition = model_definition
        # some changes for streaming scenario
        self.model_definition['AudioToMelSpectrogramPreprocessor']['dither'] = 0
        self.model_definition['AudioToMelSpectrogramPreprocessor']['pad_to'] = 0
        # spectrogram normalization constants
        normalization = {}
        normalization['fixed_mean'] = [
            -14.95827016, -12.71798736, -11.76067913, -10.83311182,
            -10.6746914,  -10.15163465, -10.05378331, -9.53918999,
            -9.41858904,  -9.23382904,  -9.46470918,  -9.56037,
            -9.57434245,  -9.47498732,  -9.7635205,   -10.08113074,
            -10.05454561, -9.81112681,  -9.68673603,  -9.83652977,
            -9.90046248,  -9.85404766,  -9.92560366,  -9.95440354,
            -10.17162966, -9.90102482,  -9.47471025,  -9.54416855,
            -10.07109475, -9.98249912,  -9.74359465,  -9.55632283,
            -9.23399915,  -9.36487649,  -9.81791084,  -9.56799225,
            -9.70630899,  -9.85148006,  -9.8594418,   -10.01378735,
            -9.98505315,  -9.62016094,  -10.342285,   -10.41070709,
            -10.10687659, -10.14536695, -10.30828702, -10.23542833,
            -10.88546868, -11.31723646, -11.46087382, -11.54877829,
            -11.62400934, -11.92190509, -12.14063815, -11.65130117,
            -11.58308531, -12.22214663, -12.42927197, -12.58039805,
            -13.10098969, -13.14345864, -13.31835645, -14.47345634]

        normalization['fixed_std'] = [
            3.81402054, 4.12647781, 4.05007065, 3.87790987,
            3.74721178, 3.68377423, 3.69344,    3.54001005,
            3.59530412, 3.63752368, 3.62826417, 3.56488469,
            3.53740577, 3.68313898, 3.67138151, 3.55707266,
            3.54919572, 3.55721289, 3.56723346, 3.46029304,
            3.44119672, 3.49030548, 3.39328435, 3.28244406,
            3.28001423, 3.26744937, 3.46692348, 3.35378948,
            2.96330901, 2.97663111, 3.04575148, 2.89717604,
            2.95659301, 2.90181116, 2.7111687,  2.93041291,
            2.86647897, 2.73473181, 2.71495654, 2.75543763,
            2.79174615, 2.96076456, 2.57376336, 2.68789782,
            2.90930817, 2.90412004, 2.76187531, 2.89905006,
            2.65896173, 2.81032176, 2.87769857, 2.84665271,
            2.80863137, 2.80707634, 2.83752184, 3.01914511,
            2.92046439, 2.78461139, 2.90034605, 2.94599508,
            2.99099718, 3.0167554,  3.04649716, 2.94116777]
            
        self.model_definition['AudioToMelSpectrogramPreprocessor']['normalize'] = normalization
        self.neural_factory = nemo.core.NeuralModuleFactory(
            placement=nemo.core.DeviceType.GPU,
            backend=nemo.core.Backend.PyTorch)

        self.data_layer = AudioDataLayer(self.model_definition["sample_rate"])
        self. data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **self.model_definition['AudioToMelSpectrogramPreprocessor'])

        self.jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=self.model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **self.model_definition['JasperEncoder'])

        self.jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=self.model_definition['JasperEncoder']['jasper'][-1]['filters'],
            num_classes=len(model_definition['labels']))


        self.load_model(CHECKPOINT_ENCODER, CHECKPOINT_DECODER)
        self.create_dag()
Exemple #12
0
def create_dags(jasper_params, args, nf):
    vocab = jasper_params['labels']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params,
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    logging.info("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params,
    )

    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        vocab,
        log_probs_e,
        encoded_len_e,
    )
Exemple #13
0
def main(
    config_file,
    nn_encoder,
    nn_decoder,
    nn_onnx_encoder,
    nn_onnx_decoder,
    pre_v09_model=False,
    batch_size=1,
    time_steps=256,
    decoder_type='ctc',
):
    yaml = YAML(typ="safe")

    logging.info("Loading config file...")
    with open(config_file) as f:
        jasper_model_definition = yaml.load(f)

    logging.info("Determining model shape...")
    num_encoder_input_features = 64
    decoder_params = jasper_model_definition['init_params']['decoder_params']['init_params']
    num_decoder_input_features = decoder_params['feat_in']
    logging.info("  Num encoder input features: {}".format(num_encoder_input_features))
    logging.info("  Num decoder input features: {}".format(num_decoder_input_features))

    nf = nemo.core.NeuralModuleFactory(create_tb_writer=False)

    logging.info("Initializing models...")
    jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['init_params']['encoder_params']['init_params'])

    if decoder_type == 'ctc':
        jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=num_decoder_input_features,
            num_classes=decoder_params['num_classes'],
            vocabulary=decoder_params['vocabulary'],
        )
    elif decoder_type == 'classification':
        if 'labels' in jasper_model_definition:
            num_classes = len(jasper_model_definition['labels'])
        else:
            raise ValueError("List of class labels must be defined in model config file with key 'labels'")

        jasper_decoder = nemo_asr.JasperDecoderForClassification(
            feat_in=num_decoder_input_features, num_classes=num_classes
        )
    else:
        raise ValueError("`decoder_type` must be one of ['ctc', 'classification']")

    # This is necessary if you are using checkpoints trained with NeMo
    # version before 0.9
    logging.info("Loading checkpoints...")
    if pre_v09_model:
        logging.info("  Converting pre v0.9 checkpoint...")
        ckpt = torch.load(nn_encoder)
        new_ckpt = {}
        for k, v in ckpt.items():
            new_k = k.replace('.conv.', '.mconv.')
            if len(v.shape) == 3:
                new_k = new_k.replace('.weight', '.conv.weight')
            new_ckpt[new_k] = v
        jasper_encoder.load_state_dict(new_ckpt)
    else:
        jasper_encoder.restore_from(nn_encoder)
    jasper_decoder.restore_from(nn_decoder)

    # Create export directories if they don't already exist
    base_export_dir, export_fn = os.path.split(nn_onnx_encoder)
    if base_export_dir and not os.path.exists(base_export_dir):
        os.makedirs(base_export_dir)

    base_export_dir, export_fn = os.path.split(nn_onnx_decoder)
    if base_export_dir and not os.path.exists(base_export_dir):
        os.makedirs(base_export_dir)

    logging.info("Exporting encoder...")
    nf.deployment_export(
        jasper_encoder,
        nn_onnx_encoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(batch_size, num_encoder_input_features, time_steps, dtype=torch.float, device="cuda:0",),
    )
    del jasper_encoder
    logging.info("Exporting decoder...")
    nf.deployment_export(
        jasper_decoder,
        nn_onnx_decoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        (torch.zeros(batch_size, num_decoder_input_features, time_steps // 2, dtype=torch.float, device="cuda:0",)),
    )
    del jasper_decoder
    logging.info("Export completed successfully.")
Exemple #14
0
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)
    # store results
    parser.add_argument("--save_logprob", default=None, type=str)

    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        help='maximum value of word count weight (for a grid search in \
          \'eval\' mode',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer(
    )
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)

    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0

        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []

        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Infering with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    num_cpus=max(os.cpu_count(), 1),
                    input_tensor=False,
                )

                beam_predictions = beam_search_with_lm(log_probs=logprobexp,
                                                       log_probs_length=None,
                                                       force_pt=True)

                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
labels = jasper_model_definition['labels']

# Instantiate necessary Neural Modules
# Note that data layer is missing from here
# neural_factory = nemo.core.NeuralModuleFactory(placement=nemo.core.DeviceType.GPU)
neural_factory = nemo.core.NeuralModuleFactory(
    placement=nemo.core.DeviceType.CPU)
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor()
jasper_encoder = nemo_asr.JasperEncoder(
    jasper=jasper_model_definition['JasperEncoder']['jasper'],
    activation=jasper_model_definition['JasperEncoder']['activation'],
    feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']
    ['features'],
)
jasper_encoder.restore_from(CHECKPOINT_ENCODER, local_rank=0)
jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                              num_classes=len(labels))
jasper_decoder.restore_from(CHECKPOINT_DECODER, local_rank=0)
greedy_decoder = nemo_asr.GreedyCTCDecoder()

if ENABLE_NGRAM and os.path.isfile(LM_PATH):
    beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
        vocab=labels,
        beam_width=64,
        alpha=2.0,
        beta=1.0,
        lm_path=LM_PATH,
        num_cpus=max(os.cpu_count(), 1),
    )
else:
    logging.info("Beam search is not enabled")
Exemple #16
0
    def test_simple_dags(self):
        # module instantiation
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        labels = jasper_model_definition['labels']

        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=labels,
            batch_size=4,
        )
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()

        # DAG definition
        (
            audio_signal,
            audio_signal_len,
            transcript,
            transcript_len,
        ) = data_layer()
        processed_signal, processed_signal_len = data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)

        spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
        aug_signal = spec_augment(input_spec=processed_signal)

        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                              length=processed_signal_len)
        log_probs = jasper_decoder(encoder_output=encoded)
        predictions = greedy_decoder(log_probs=log_probs)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4,
            )
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'],
            )
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            (
                audio_signal,
                audio_signal_len,
                transcript,
                transcript_len,
            ) = data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
            log_probs = jasper_decoder(encoder_output=processed_signal)

        self.assertRaises(NeuralPortNmTensorMismatchError, wrong)
Exemple #17
0
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    nemo.logging.info('================================')

    # Train DAG
    (
        audio_signal_t,
        a_sig_length_t,
        transcript_t,
        transcript_len_t,
    ) = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (
            processed_signal_t,
            p_length_t,
            transcript_t,
            transcript_len_t,
        ) = multiply_batch(
            in_x=processed_signal_t,
            in_x_len=p_length_t,
            in_y=transcript_t,
            in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t,
                                              length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress,
                           labels=vocab,
                           eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (
            audio_signal_e,
            a_sig_length_e,
            transcript_e,
            transcript_len_e,
        ) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e,
                predictions_e,
                transcript_e,
                transcript_len_e,
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              eval_metric='CER',
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = load_vocab(args.vocab_file)

    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            num_cpus=max(os.cpu_count(), 1),
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        logging.info("Beam CER {:.2f}".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
Exemple #19
0
def main(
    config_file,
    nn_encoder,
    nn_decoder,
    nn_onnx_encoder,
    nn_onnx_decoder,
    pre_v09_model=False,
    batch_size=1,
    time_steps=256,
):
    yaml = YAML(typ="safe")

    logging.info("Loading config file...")
    with open(config_file) as f:
        jasper_model_definition = yaml.load(f)

    logging.info("Determining model shape...")
    if 'AudioPreprocessing' in jasper_model_definition:
        num_encoder_input_features = jasper_model_definition['AudioPreprocessing']['features']
    elif 'AudioToMelSpectrogramPreprocessor' in jasper_model_definition:
        num_encoder_input_features = jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features']
    else:
        num_encoder_input_features = 64
    num_decoder_input_features = jasper_model_definition['JasperEncoder']['jasper'][-1]['filters']
    logging.info("  Num encoder input features: {}".format(num_encoder_input_features))
    logging.info("  Num decoder input features: {}".format(num_decoder_input_features))

    nf = nemo.core.NeuralModuleFactory(create_tb_writer=False)

    logging.info("Initializing models...")
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=num_encoder_input_features, **jasper_model_definition['JasperEncoder']
    )

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=num_decoder_input_features, num_classes=len(jasper_model_definition['labels']),
    )

    # This is necessary if you are using checkpoints trained with NeMo
    # version before 0.9
    logging.info("Loading checkpoints...")
    if pre_v09_model:
        logging.info("  Converting pre v0.9 checkpoint...")
        ckpt = torch.load(nn_encoder)
        new_ckpt = {}
        for k, v in ckpt.items():
            new_k = k.replace('.conv.', '.mconv.')
            if len(v.shape) == 3:
                new_k = new_k.replace('.weight', '.conv.weight')
            new_ckpt[new_k] = v
        jasper_encoder.load_state_dict(new_ckpt)
    else:
        jasper_encoder.restore_from(nn_encoder)
    jasper_decoder.restore_from(nn_decoder)

    logging.info("Exporting encoder...")
    nf.deployment_export(
        jasper_encoder,
        nn_onnx_encoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(batch_size, num_encoder_input_features, time_steps, dtype=torch.float, device="cuda:0",),
    )
    logging.info("Exporting decoder...")
    nf.deployment_export(
        jasper_decoder,
        nn_onnx_decoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        (torch.zeros(batch_size, num_decoder_input_features, time_steps // 2, dtype=torch.float, device="cuda:0",)),
    )
    logging.info("Export completed successfully.")
    def test_stft_conv_training(self):
        """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Exemple #21
0
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params["labels"]
    sample_rate = jasper_params["sample_rate"]

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)
    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    if args.dataset:
        d_path = Path(args.dataset)
        if not args.train_dataset:
            args.train_dataset = str(d_path / Path("train_manifest.json"))
        if not args.eval_datasets:
            args.eval_datasets = [str(d_path / Path("test_manifest.json"))]

    data_loader_layer = nemo_asr.AudioToTextDataLayer

    if args.remote_data:
        train_dl_params["rpyc_host"] = args.remote_data
        data_loader_layer = RpycAudioToTextDataLayer

    # data_layer = data_loader_layer(
    #     manifest_filepath=args.train_dataset,
    #     sample_rate=sample_rate,
    #     labels=vocab,
    #     batch_size=args.batch_size,
    #     num_workers=cpu_per_traindl,
    #     **train_dl_params,
    #     # normalize_transcripts=False
    # )
    #
    # N = len(data_layer)
    # steps_per_epoch = math.ceil(
    #     N / (args.batch_size * args.iter_per_step * args.num_gpus)
    # )
    # logging.info("Have {0} examples to train on.".format(N))
    #
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # multiply_batch_config = jasper_params.get("MultiplyBatch", None)
    # if multiply_batch_config:
    #     multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)
    #
    # spectr_augment_config = jasper_params.get("SpectrogramAugmentation", None)
    # if spectr_augment_config:
    #     data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
    #         **spectr_augment_config
    #     )
    #
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    if args.remote_data:
        eval_dl_params["rpyc_host"] = args.remote_data
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    # if args.eval_datasets:
    for eval_datasets in args.eval_datasets:
        data_layer_eval = data_loader_layer(
            manifest_filepath=eval_datasets,
            sample_rate=sample_rate,
            labels=vocab,
            batch_size=args.eval_batch_size,
            num_workers=cpu_per_traindl,
            **eval_dl_params,
        )

        data_layers_eval.append(data_layer_eval)
    # else:
    #     logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_encoder.restore_from(args.encoder_checkpoint, local_rank=0)

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    jasper_decoder.restore_from(args.decoder_checkpoint, local_rank=0)

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # logging.info("================================")
    # logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    # logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    # logging.info(
    #     f"Total number of parameters in model: "
    #     f"{jasper_decoder.num_weights + jasper_encoder.num_weights}"
    # )
    # logging.info("================================")
    #
    # # Train DAG
    # (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t) = data_layer()
    # processed_signal_t, p_length_t = data_preprocessor(
    #     input_signal=audio_signal_t, length=a_sig_length_t
    # )
    #
    # if multiply_batch_config:
    #     (
    #         processed_signal_t,
    #         p_length_t,
    #         transcript_t,
    #         transcript_len_t,
    #     ) = multiply_batch(
    #         in_x=processed_signal_t,
    #         in_x_len=p_length_t,
    #         in_y=transcript_t,
    #         in_y_len=transcript_len_t,
    #     )
    #
    # if spectr_augment_config:
    #     processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)
    #
    # encoded_t, encoded_len_t = jasper_encoder(
    #     audio_signal=processed_signal_t, length=p_length_t
    # )
    # log_probs_t = jasper_decoder(encoder_output=encoded_t)
    # predictions_t = greedy_decoder(log_probs=log_probs_t)
    # loss_t = ctc_loss(
    #     log_probs=log_probs_t,
    #     targets=transcript_t,
    #     input_length=encoded_len_t,
    #     target_length=transcript_len_t,
    # )
    #
    # # Callbacks needed to print info to console and Tensorboard
    # train_callback = nemo.core.SimpleLossLoggerCallback(
    #     tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
    #     print_func=partial(monitor_asr_train_progress, labels=vocab),
    #     get_tb_values=lambda x: [("loss", x[0])],
    #     tb_writer=neural_factory.tb_writer,
    # )
    #
    # chpt_callback = nemo.core.CheckpointCallback(
    #     folder=neural_factory.checkpoint_dir,
    #     load_from_folder=args.load_dir,
    #     step_freq=args.checkpoint_save_freq,
    #     checkpoints_to_keep=30,
    # )
    #
    # callbacks = [train_callback, chpt_callback]
    callbacks = []
    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e,
         transcript_len_e) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e, predictions_e, transcript_e, transcript_len_e
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return callbacks
Exemple #22
0
    def test_double_jasper_training(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder1 = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_encoder2 = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        mx_max1 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
        mx_max2 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
        jasper_decoder1 = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                       num_classes=len(
                                                           self.labels))
        jasper_decoder2 = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                       num_classes=len(
                                                           self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded1, encoded_len1 = jasper_encoder1(audio_signal=processed_signal,
                                                 length=p_length)
        encoded2, encoded_len2 = jasper_encoder2(audio_signal=processed_signal,
                                                 length=p_length)
        log_probs1 = jasper_decoder1(encoder_output=encoded1)
        log_probs2 = jasper_decoder2(encoder_output=encoded2)
        log_probs = mx_max1(x1=log_probs1, x2=log_probs2)
        encoded_len = mx_max2(x1=encoded_len1, x2=encoded_len2)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )