Beispiel #1
0
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"],
    )

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
        factory=neural_factory,
    )

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}"
    )
    nemo.logging.info('================================')

    # Train DAG
    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab, eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, eval_metric='CER', tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
Beispiel #2
0
yaml = YAML(typ="safe")
with open("./config.yaml") as f:
    quartznet_model_definition = yaml.load(f)
labels = quartznet_model_definition['labels']
print(labels)
# Instantiate neural modules
data_layer = nemo_asr.AudioToTextDataLayer(manifest_filepath=train_dataset,
                                           labels=labels,
                                           batch_size=32)
data_layer_val = nemo_asr.AudioToTextDataLayer(manifest_filepath=eval_datasets,
                                               labels=labels,
                                               batch_size=32,
                                               shuffle=False)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor()
spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)

encoder = nemo_asr.JasperEncoder(feat_in=64,
                                 **quartznet_model_definition['JasperEncoder'])
decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels))
ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
greedy_decoder = nemo_asr.GreedyCTCDecoder()

# Training DAG (Model)
audio_signal, audio_signal_len, transcript, transcript_len = data_layer()
processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal, length=audio_signal_len)
aug_signal = spec_augment(input_spec=processed_signal)
encoded, encoded_len = encoder(audio_signal=aug_signal,
                               length=processed_signal_len)
log_probs = decoder(encoder_output=encoded)
def create_dag(args, cfg, logger, num_gpus):
    # Defining nodes
    data = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=cfg['target']['labels'],
        batch_size=cfg['optimization']['batch_size'],
        eos_id=cfg['target']['eos_id'],
        **cfg['AudioToTextDataLayer']['train'],
    )
    data_evals = []
    if args.eval_datasets:
        for val_path in args.eval_datasets:
            data_evals.append(
                nemo_asr.AudioToTextDataLayer(
                    manifest_filepath=val_path,
                    labels=cfg['target']['labels'],
                    batch_size=cfg['inference']['batch_size'],
                    eos_id=cfg['target']['eos_id'],
                    **cfg['AudioToTextDataLayer']['eval'],
                ))
    else:
        logger.info("There were no val datasets passed")
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg['AudioToMelSpectrogramPreprocessor'])
    data_augmentation = nemo_asr.SpectrogramAugmentation(
        **cfg['SpectrogramAugmentation'])
    encoder = nemo_asr.JasperEncoder(
        feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"],
        **cfg['JasperEncoder'],
    )
    if args.encoder_checkpoint is not None and os.path.exists(
            args.encoder_checkpoint):
        if cfg['JasperEncoder']['load']:
            encoder.restore_from(args.encoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for encoder'
                        f' from {args.encoder_checkpoint}')
        if cfg['JasperEncoder']['freeze']:
            encoder.freeze()
            logger.info(f'Freeze encoder weights')
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'],
        out_channels=cfg['DecoderRNN']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )
    if args.decoder_checkpoint is not None and os.path.exists(
            args.decoder_checkpoint):
        if cfg['DecoderRNN']['load']:
            decoder.restore_from(args.decoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for decoder'
                        f' from {args.decoder_checkpoint}')
        if cfg['DecoderRNN']['freeze']:
            decoder.freeze()
            logger.info(f'Freeze decoder weights')
            if cfg['decoder']['unfreeze_attn']:
                for name, param in decoder.attention.named_parameters():
                    param.requires_grad = True
                logger.info(f'Unfreeze decoder attn weights')
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / (batch_size * num_gpus))
    total_steps = num_epochs * steps_per_epoch
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
        sample_wise=cfg['optimization']['sample_wise'],
    )
    se_callback = ValueSetterCallback(
        seq_loss,
        'smoothing_coef',
        policies=[
            vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef),
                       start=0.0,
                       end=1.0),
        ],
        total_steps=total_steps,
    )
    beam_search = nemo.backends.pytorch.BeamSearch(
        decoder=decoder,
        pad_id=cfg['target']['pad_id'],
        bos_id=cfg['target']['bos_id'],
        eos_id=cfg['target']['eos_id'],
        max_len=cfg['target']['max_len'],
        beam_size=cfg['inference']['beam_size'],
    )
    uf_callback = UnfreezeCallback(
        [encoder, decoder], start_epoch=cfg['optimization']['start_unfreeze'])
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[encoder, connector, decoder],
        folder=args.checkpoint_dir,
        step_freq=args.eval_freq,
    )

    # Creating DAG
    audios, audio_lens, transcripts, _ = data()
    processed_audios, processed_audio_lens = data_preprocessor(
        input_signal=audios, length=audio_lens)
    augmented_spec = data_augmentation(input_spec=processed_audios)
    encoded, _ = encoder(audio_signal=augmented_spec,
                         length=processed_audio_lens)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    train_loss = seq_loss(log_probs=log_probs, targets=transcripts)
    evals = []
    for i, data_eval in enumerate(data_evals):
        audios, audio_lens, transcripts, _ = data_eval()
        processed_audios, processed_audio_lens = data_preprocessor(
            input_signal=audios, length=audio_lens)
        encoded, _ = encoder(audio_signal=processed_audios,
                             length=processed_audio_lens)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = seq_loss(log_probs=log_probs, targets=transcripts)
        predictions, aw = beam_search(encoder_outputs=encoded)
        evals.append((
            args.eval_datasets[i],
            (loss, log_probs, transcripts, predictions, aw),
        ))

    # Update config
    cfg['num_params'] = {
        'encoder': encoder.num_weights,
        'connector': connector.num_weights,
        'decoder': decoder.num_weights,
    }
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (
        (train_loss, evals),
        cfg,
        [tf_callback, se_callback, uf_callback, saver_callback],
    )
Beispiel #4
0
    def test_simple_dags(self):
        # module instantiation
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        labels = jasper_model_definition['labels']

        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=labels,
            batch_size=4,
        )
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()

        # DAG definition
        (
            audio_signal,
            audio_signal_len,
            transcript,
            transcript_len,
        ) = data_layer()
        processed_signal, processed_signal_len = data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)

        spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
        aug_signal = spec_augment(input_spec=processed_signal)

        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                              length=processed_signal_len)
        log_probs = jasper_decoder(encoder_output=encoded)
        predictions = greedy_decoder(log_probs=log_probs)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4,
            )
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'],
            )
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            (
                audio_signal,
                audio_signal_len,
                transcript,
                transcript_len,
            ) = data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
            log_probs = jasper_decoder(encoder_output=processed_signal)

        self.assertRaises(NeuralPortNmTensorMismatchError, wrong)
Beispiel #5
0
def create_all_dags(args, neural_factory):
    '''
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks'''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    vocab = quartz_params['labels']
    sample_rate = quartz_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **quartz_params["JasperEncoder"],
    )

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present

    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    # assemble train DAG

    (
        audio_signal_t,
        a_sig_length_t,
        transcript_t,
        transcript_len_t,
    ) = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (
            processed_signal_t,
            p_length_t,
            transcript_t,
            transcript_len_t,
        ) = multiply_batch(
            in_x=processed_signal_t,
            in_x_len=p_length_t,
            in_y=transcript_t,
            in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t,
                                       length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.load_dir,
            step_freq=args.checkpoint_save_freq,
        )

        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (
            audio_signal_e,
            a_sig_length_e,
            transcript_e,
            transcript_len_e,
        ) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e,
                                           length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e,
                predictions_e,
                transcript_e,
                transcript_len_e,
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
Beispiel #6
0
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    labels = jasper_params['labels']  # Vocab of tokens
    sample_rate = jasper_params['sample_rate']
    preprocessor = jasper_params['preprocessor']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    # Look for augmentations
    audio_augmentor = jasper_params.get('AudioAugmentor', None)

    data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=labels,
        sample_rate=sample_rate,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        **train_dl_params,
    )

    crop_pad_augmentation = nemo_asr.CropOrPadSpectrogramAugmentation(audio_length=jasper_params['timesteps'])

    N = len(data_layer)
    steps_per_epoch = math.ceil(N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info('Steps per epoch : {0}'.format(steps_per_epoch))
    logging.info('Have {0} examples to train on.'.format(N))

    if preprocessor == "AudioToMFCCPreprocessor":
        data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(sample_rate=sample_rate, **jasper_params[preprocessor])
    elif preprocessor == "AudioToMelSpectrogramPreprocessor":
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            sample_rate=sample_rate, **jasper_params[preprocessor]
        )

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=labels,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"],)

    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(labels),
        **jasper_params['JasperDecoderForClassification'],
    )

    ce_loss = nemo_asr.CrossEntropyLossNM()

    logging.info('================================')
    logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Train DAG
    # --- Assemble Training DAG --- #
    audio_signal, audio_signal_len, commands, command_len = data_layer()

    processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)

    processed_signal, processed_signal_len = crop_pad_augmentation(
        input_signal=processed_signal, length=audio_signal_len
    )

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)

    decoded = jasper_decoder(encoder_output=encoded)

    loss = ce_loss(logits=decoded, labels=commands)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        # Notice that we pass in loss, predictions, and the labels (commands).
        # Of course we would like to see our training loss, but we need the
        # other arguments to calculate the accuracy.
        tensors=[loss, decoded, commands],
        # The print_func defines what gets printed.
        print_func=partial(monitor_classification_training_progress, eval_metric=None),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        # --- Assemble Training DAG --- #
        test_audio_signal, test_audio_signal_len, test_commands, test_command_len = eval_dl()

        test_processed_signal, test_processed_signal_len = data_preprocessor(
            input_signal=test_audio_signal, length=test_audio_signal_len
        )

        test_processed_signal, test_processed_signal_len = crop_pad_augmentation(
            input_signal=test_processed_signal, length=test_processed_signal_len
        )

        test_encoded, test_encoded_len = jasper_encoder(
            audio_signal=test_processed_signal, length=test_processed_signal_len
        )

        test_decoded = jasper_decoder(encoder_output=test_encoded)

        test_loss = ce_loss(logits=test_decoded, labels=test_commands)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[test_loss, test_decoded, test_commands],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(process_classification_evaluation_epoch, eval_metric=1, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss, callbacks, steps_per_epoch
Beispiel #7
0
def create_all_dags(args, neural_factory):
    """
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks"""

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params["sample_rate"]
    time_length = spkr_params.get("time_length", 8)
    logging.info("max time length considered is {} sec".format(time_length))

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) // 2

    # create data layer for training
    train_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    audio_augmentor = spkr_params.get("AudioAugmentor", None)
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        time_length=time_length,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))

    logging.info("Number of steps per epoch {}".format(steps_per_epoch))
    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_test = []
    for test_set in args.eval_datasets:

        data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=test_set,
            labels=data_layer_train.labels,
            batch_size=args.batch_size,
            num_workers=cpu_per_traindl,
            time_length=time_length,
            **eval_dl_params,
            # normalize_transcripts=False
        )
        data_layers_test.append(data_layer_test)
    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )

    spectr_augment_config = spkr_params.get("SpectrogramAugmentation", None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)
    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"], )

    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=data_layer_train.num_classes,
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(
            ","),
    )
    if os.path.exists(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt"):
        encoder.restore_from(args.checkpoint_dir +
                             "/JasperEncoder-STEP-100.pt")
        logging.info("Pretrained Encoder loaded")

    weight = None
    xent_loss = nemo_asr.CrossEntropyLossNM(weight=weight)

    # assemble train DAG

    audio_signal, audio_signal_len, label, label_len = data_layer_train()

    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(
            input_spec=processed_signal)

    encoded, encoded_len = encoder(audio_signal=processed_signal,
                                   length=processed_signal_len)

    logits, _ = decoder(encoder_output=encoded)
    loss = xent_loss(logits=logits, labels=label)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, logits, label],
        print_func=partial(monitor_classification_training_progress,
                           eval_metric=[1]),
        step_freq=args.print_freq,
        get_tb_values=lambda x: [("train_loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.checkpoint_dir,  # load dir
            step_freq=args.checkpoint_save_freq,
            checkpoints_to_keep=125,
        )

        callbacks.append(chpt_callback)

    # --- Assemble Validation DAG --- #

    for i, eval_layer in enumerate(data_layers_test):

        audio_signal_test, audio_len_test, label_test, _ = eval_layer()
        processed_signal_test, processed_len_test = data_preprocessor(
            input_signal=audio_signal_test, length=audio_len_test)
        encoded_test, encoded_len_test = encoder(
            audio_signal=processed_signal_test, length=processed_len_test)
        logits_test, _ = decoder(encoder_output=encoded_test)
        loss_test = xent_loss(logits=logits_test, labels=label_test)

        tagname = os.path.dirname(
            args.eval_datasets[i]).split("/")[-1] + "_" + str(i)
        print(tagname)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_test, logits_test, label_test],
            user_iter_callback=partial(process_classification_evaluation_batch,
                                       top_k=1),
            user_epochs_done_callback=partial(
                process_classification_evaluation_epoch, tag=tagname),
            eval_step=args.
            eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test