Example #1
def main():
    args, name = parse_args()

    log_dir = name
    if args.work_dir:
        log_dir = os.path.join(args.work_dir, name)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as file:
        tacotron2_params = yaml.load(file)
        labels = tacotron2_params["labels"]
    # instantiate neural modules
    neural_modules = create_NMs(args.model_config, labels)

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        neural_factory=neural_factory,
        neural_modules=neural_modules,
        tacotron2_config_file=args.model_config,
        train_dataset=args.train_dataset,
        batch_size=args.batch_size,
        eval_freq=args.eval_freq,
        checkpoint_save_freq=args.checkpoint_save_freq,
        eval_datasets=args.eval_datasets,
        eval_batch_size=args.eval_batch_size,
        labels=labels,
    )

    # train model
    total_steps = args.max_steps if args.max_steps is not None else args.num_epochs * steps_per_epoch
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=CosineAnnealing(total_steps, min_lr=args.min_lr),
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": args.grad_norm_clip,
        },
        batches_per_step=args.iter_per_step,
    )
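
Note on Example #1: the script drives training with CosineAnnealing(total_steps, min_lr=args.min_lr). For orientation only (this is not NeMo's implementation), a cosine-annealed learning rate typically follows the formula sketched below; the function name and arguments are illustrative.

import math

def cosine_annealing_lr(step, total_steps, base_lr, min_lr=0.0):
    # Standard cosine decay: base_lr at step 0, min_lr at total_steps.
    progress = min(step, total_steps) / float(total_steps)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
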
Example #2
def main():
    args = parse_args()
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.max_steps,
        args.num_epochs,
        args.weight_decay,
        args.optimizer,
        args.iter_per_step,
    )
    log_dir = name
    if args.work_dir:
        log_dir = os.path.join(args.work_dir, name)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir,
    )
    args.num_gpus = neural_factory.world_size

    checkpoint_dir = neural_factory.checkpoint_dir
    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=CosineAnnealing(
            args.max_steps if args.max_steps is not None else args.num_epochs *
            steps_per_epoch,
            warmup_steps=args.warmup_steps,
        ),
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
    )
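
Note on Example #2: the schedule length is args.max_steps when it is given, otherwise args.num_epochs * steps_per_epoch, and a warmup_steps value is passed to CosineAnnealing. Below is a minimal sketch of that fallback plus a linear warmup ramp; the linear ramp is an assumption about typical warmup behaviour, not a copy of the policy code.

def resolve_total_steps(max_steps, num_epochs, steps_per_epoch):
    # Prefer an explicit step budget; otherwise derive one from the epoch count.
    return max_steps if max_steps is not None else num_epochs * steps_per_epoch

def linear_warmup_scale(step, warmup_steps):
    # Assumption: the base learning rate is scaled linearly from 0 to 1 during warmup.
    if not warmup_steps:
        return 1.0
    return min(1.0, step / float(warmup_steps))
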
Example #3
def main():
    args = parse_args()

    print(args)
    emb_size = 1024
    name = construct_name(
        args.exp_name, args.lr, args.batch_size, args.num_epochs, args.weight_decay, args.optimizer, emb_size=emb_size,
    )
    work_dir = name
    if args.work_dir:
        work_dir = os.path.join(args.work_dir, name)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=work_dir,
        checkpoint_dir=args.checkpoint_dir + "/" + args.exp_name,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        random_seed=42,
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir + "/" + name,
    )
    args.num_gpus = neural_factory.world_size

    args.checkpoint_dir = neural_factory.checkpoint_dir

    if args.local_rank is not None:
        logging.info("Doing ALL GPU")

    # build dags
    (train_loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test,) = create_all_dags(
        args, neural_factory
    )

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=CosineAnnealing(
            args.num_epochs * steps_per_epoch, warmup_steps=0.1 * args.num_epochs * steps_per_epoch,
        ),
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
        synced_batchnorm=args.synced_bn,
        synced_batchnorm_groupsize=args.synced_bn_groupsize,
    )
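
Note on Example #3: the warmup window is hard-coded to 10% of the schedule (0.1 * args.num_epochs * steps_per_epoch), which is what other examples express with warmup_ratio. A tiny illustrative conversion helper (hypothetical name):

def warmup_steps_from_ratio(warmup_ratio, total_steps):
    # warmup_steps = 0.1 * total_steps is the same as warmup_ratio=0.1.
    return int(warmup_ratio * total_steps)

For instance, warmup_steps_from_ratio(0.1, args.num_epochs * steps_per_epoch) reproduces the value passed above.
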
Example #4
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[tgt_, eval_loss, beam_trans, sent_ids_],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, tokenizer),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x, validation_dataset=valid_dataset),
    eval_step=args.eval_freq,
    tb_writer=tb_writer)

# callback which saves checkpoints once in a while
callback_ckpt = nemo.core.CheckpointCallback(
    folder=args.checkpoint_dir,
    step_freq=args.checkpoint_save_freq,
    checkpoints_to_keep=1)

# define learning rate decay policy
lr_policy = CosineAnnealing(args.max_steps, warmup_steps=args.warmup_steps)

# define and launch training algorithm (optimizer)
max_num_epochs = 0 if args.interactive else args.num_epochs

optimizer = neural_factory.get_trainer()

callbacks = [callback_ckpt]
if not args.interactive:
    callbacks.extend([callback_train, callback_eval])
optimizer.train(tensors_to_optimize=[train_loss],
                callbacks=callbacks,
                optimizer=args.optimizer,
                lr_policy=lr_policy,
                optimization_params={
                    "num_epochs": max_num_epochs,
Example #5
def main():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='AN4 ASR',
                                     conflict_handler='resolve')

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs=1,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="/home/mrjenkins/TestData/an4_dataset/an4_train.json",
        eval_datasets="/home/mrjenkins/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        checkpoint_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        batch_size=32,
        eval_batch_size=16,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1")

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       optimization_level=args.amp_opt_level,
                                       random_seed=0,
                                       log_dir=args.work_dir,
                                       checkpoint_dir=args.checkpoint_dir,
                                       create_tb_writer=True,
                                       cudnn_benchmark=args.cudnn_benchmark)
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir
    args.checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params)

    num_samples = len(data_layer)
    total_steps = int(num_samples * args.num_epochs / args.batch_size)
    print("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioPreprocessing(
        sample_rate=sample_rate, **jasper_params["AudioPreprocessing"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params)

    num_samples = len(data_layer_eval)
    nf.logger.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioPreprocessing"]["features"],
        **jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(log_probs=log_probs_e,
                      targets=transcript_e,
                      input_length=encoded_len_e,
                      target_length=transcript_len_e)
    nf.logger.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=lambda x: monitor_asr_train_progress(x, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=lambda x, y: process_evaluation_batch(
            x, y, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=tb_writer)

    nf.train(tensors_to_optimize=[loss],
             callbacks=[train_callback, eval_callback, checkpointer_callback],
             optimizer=args.optimizer,
             lr_policy=CosineAnnealing(total_steps=total_steps),
             optimization_params={
                 "num_epochs": args.num_epochs,
                 "max_steps": args.max_steps,
                 "lr": args.lr,
                 "momentum": args.momentum,
                 "betas": betas,
                 "weight_decay": args.weight_decay,
                 "grad_norm_clip": None
             },
             batches_per_step=args.iter_per_step)

    if args.test_after_training:
        # Create BeamSearch NM
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=64,
            alpha=2.,
            beta=1.5,
            lm_path=args.lm,
            num_cpus=max(os.cpu_count(), 1))
        beam_predictions = beam_search_with_lm(log_probs=log_probs_e,
                                               log_probs_length=encoded_len_e)
        eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
        nf.logger.info("Greedy WER: {:.2f}".format(wer * 100))
        assert wer <= wer_thr, (
            "Final eval greedy WER {:.2f}% is higher than the threshold {:.2f}%".format(
                wer * 100, wer_thr * 100))

        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                   references=references)
        nf.logger.info("Beam WER {:.2f}%".format(beam_wer * 100))
        assert beam_wer <= beam_wer_thr, (
            "Final eval beam WER {:.2f}% is higher than the threshold {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100))
        assert beam_wer <= wer, "Final eval beam WER is higher than the greedy WER."

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True)

        nf.reset_trainer()
        nf.train(tensors_to_optimize=[loss],
                 callbacks=[train_callback, checkpointer_callback],
                 optimizer=args.optimizer,
                 optimization_params={
                     "num_epochs": args.num_epochs + 10,
                     "lr": args.lr,
                     "momentum": args.momentum,
                     "betas": betas,
                     "weight_decay": args.weight_decay,
                     "grad_norm_clip": None
                 },
                 reset=True)

        evaluated_tensors = nf.infer(eval_tensors[:-1])
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
        nf.logger.info("New greedy WER: {:.2f}%".format(wer_new * 100))
        assert wer_new <= wer * 1.1, (
            f"Fine tuning: new WER {wer_new * 100:.2f}% is higher than the previous WER "
            f"{wer * 100:.2f}%")
Example #6
def main():
    args = parse_args()
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.max_steps,
        args.num_epochs,
        args.weight_decay,
        args.optimizer,
        args.iter_per_step,
    )

    # time stamp
    date_time = datetime.now().strftime("%m-%d-%Y -- %H-%M-%S")

    log_dir = name
    if args.work_dir:
        log_dir = os.path.join(args.work_dir, name)

    if args.tensorboard_dir is None:
        tensorboard_dir = os.path.join(name, 'tensorboard', date_time)
    else:
        tensorboard_dir = args.tensorboard_dir

    if args.checkpoint_dir is None:
        checkpoint_dir = os.path.join(name, date_time)
    else:
        base_checkpoint_dir = args.checkpoint_dir
        if len(glob.glob(os.path.join(base_checkpoint_dir, '*.pt'))) > 0:
            checkpoint_dir = base_checkpoint_dir
        else:
            checkpoint_dir = os.path.join(args.checkpoint_dir, date_time)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=tensorboard_dir,
    )
    args.num_gpus = neural_factory.world_size

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    lr_schedule = jasper_params.get('lr_schedule', 'CosineAnnealing')

    if lr_schedule == 'CosineAnnealing':
        lr_policy = CosineAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            min_lr=args.min_lr,
        )
    elif lr_schedule == 'PolynomialDecayAnnealing':
        lr_policy = PolynomialDecayAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            min_lr=args.min_lr,
            power=2.0,
        )
    elif lr_schedule == 'PolynomialHoldDecayAnnealing':
        lr_policy = PolynomialHoldDecayAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            hold_ratio=args.hold_ratio,
            min_lr=args.min_lr,
            power=2.0,
        )
    else:
        raise ValueError("LR schedule is invalid !")

    logging.info(f"Using `{lr_policy}` Learning Rate Scheduler")

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=lr_policy,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": 0.95,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
    )
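
Note on Example #6: the config can select PolynomialDecayAnnealing or PolynomialHoldDecayAnnealing with power=2.0. As a rough reference only (ignoring the warmup and hold phases, and not the library's code), polynomial decay is usually defined as:

def polynomial_decay_lr(step, total_steps, base_lr, min_lr=0.0, power=2.0):
    # Decay from base_lr to min_lr proportionally to (1 - progress) ** power.
    progress = min(step, total_steps) / float(total_steps)
    return (base_lr - min_lr) * (1.0 - progress) ** power + min_lr
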
Example #7
                      (args.batch_size * args.num_gpus * args.batch_per_step))

callback_dev = nemo.core.EvaluatorCallback(
    # eval_tensors=[dev_mlm_loss, dev_nsp_loss],
    eval_tensors=[dev_mlm_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=steps_per_epoch,
    tb_writer=tb_writer)

# define learning rate decay policy
if args.lr_decay_policy == "poly":
    lr_policy = SquareAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "cosine":
    lr_policy = CosineAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "noam":
    lr_policy = \
        InverseSquareRootAnnealing(args.num_epochs * steps_per_epoch,
                                   warmup_ratio=args.lr_warmup_proportion)
else:
    raise NotImplementedError

# save config file
if not os.path.exists(args.checkpoint_directory):
    os.makedirs(args.checkpoint_directory)

config_path = os.path.join(args.checkpoint_directory, "bert-config.json")
if not os.path.exists(config_path):
    bert_model.config.to_json_file(config_path)
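
Note on Example #7: the fragment opens mid-expression; the divisor args.batch_size * args.num_gpus * args.batch_per_step is the number of samples consumed per optimizer step, so the elided line is computing steps per epoch. A minimal sketch of that arithmetic with illustrative names:

def compute_steps_per_epoch(num_samples, batch_size, num_gpus=1, batches_per_step=1):
    # Each optimizer step consumes batch_size * num_gpus * batches_per_step samples.
    return num_samples // (batch_size * num_gpus * batches_per_step)
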
Example #8
def main():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='AN4 ASR',
                                     conflict_handler='resolve')

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs=1,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="/home/mrjenkins/TestData/an4_dataset/an4_train.json",
        eval_datasets="/home/mrjenkins/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        batch_size=48,
        eval_batch_size=64,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1")

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       files_to_copy=[__file__],
                                       optimization_level=args.amp_opt_level,
                                       random_seed=0,
                                       log_dir=args.work_dir,
                                       create_tb_writer=True,
                                       cudnn_benchmark=args.cudnn_benchmark)
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    (loss, eval_tensors, callbacks, total_steps, vocab, log_probs_e,
     encoded_len_e) = create_dags(jasper_params, args, nf)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        lr_policy=CosineAnnealing(total_steps=total_steps,
                                  min_lr=args.lr / 100),
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": args.momentum,
            "betas": betas,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None
        },
        batches_per_step=args.iter_per_step,
        amp_max_loss_scale=256.,
        # synced_batchnorm=(nf.global_rank is not None),
    )

    if args.test_after_training:
        nemo.logging.info("Testing greedy and beam search with LM WER.")
        # Create BeamSearch NM
        if nf.world_size > 1:
            nemo.logging.warning("Skipping beam search WER as it does not "
                                 "work if doing distributed training.")
        else:
            beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                vocab=vocab,
                beam_width=64,
                alpha=2.,
                beta=1.5,
                lm_path=args.lm,
                num_cpus=max(os.cpu_count(), 1))
            beam_predictions = beam_search_with_lm(
                log_probs=log_probs_e, log_probs_length=encoded_len_e)
            eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
            nemo.logging.info("Greedy WER: {:.2f}%".format(wer * 100))
            if wer > wer_thr:
                nf.sync_all_processes(False)
                raise ValueError(f"Final eval greedy WER {wer*100:.2f}% > :"
                                 f"than {wer_thr*100:.2f}%")
        nf.sync_all_processes()

        if nf.world_size == 1:
            beam_hypotheses = []
            # Over mini-batch
            for i in evaluated_tensors[-1]:
                # Over samples
                for j in i:
                    beam_hypotheses.append(j[0][1])

            beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                       references=references)
            nemo.logging.info("Beam WER {:.2f}%".format(beam_wer * 100))
            assert beam_wer <= beam_wer_thr, (
                "Final eval beam WER {:.2f}% is higher than the threshold {:.2f}%".format(
                    beam_wer * 100, beam_wer_thr * 100))
            assert beam_wer <= wer, (
                "Final eval beam WER is higher than the greedy WER.")

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True)

        # Distributed Data Parallel changes the underlying class so we need
        # to reinstantiate Encoder and Decoder
        args.num_epochs += 10
        previous_step_count = total_steps
        loss, eval_tensors, callbacks, total_steps, vocab, _, _ = create_dags(
            jasper_params, args, nf)

        nf.reset_trainer()
        nf.train(
            tensors_to_optimize=[loss],
            callbacks=callbacks,
            optimizer=args.optimizer,
            lr_policy=CosineAnnealing(warmup_steps=previous_step_count,
                                      total_steps=total_steps),
            optimization_params={
                "num_epochs": args.num_epochs,
                "lr": args.lr / 100,
                "momentum": args.momentum,
                "betas": betas,
                "weight_decay": args.weight_decay,
                "grad_norm_clip": None
            },
            reset=True,
            amp_max_loss_scale=256.,
            # synced_batchnorm=(nf.global_rank is not None),
        )

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                      references=references)
            nemo.logging.info("New greedy WER: {:.2f}%".format(wer_new * 100))
            if wer_new > wer * 1.1:
                nf.sync_all_processes(False)
                raise ValueError(
                    f"Fine tuning: new WER {wer_new * 100:.2f}% is higher than the "
                    f"previous WER {wer * 100:.2f}%")
        nf.sync_all_processes()

        # Open the log file and ensure that epochs is strictly increasing
        if nf._exp_manager.log_file:
            epochs = []
            with open(nf._exp_manager.log_file, "r") as log_file:
                line = log_file.readline()
                while line:
                    index = line.find("Starting epoch")
                    if index != -1:
                        epochs.append(int(line[index +
                                               len("Starting epoch"):]))
                    line = log_file.readline()
            for i, e in enumerate(epochs):
                if i != e:
                    raise ValueError("Epochs from logfile was not understood")
Example #9
        }
    })

train_data_size = len(train_data_layer)
steps_per_epoch = int(train_data_size / (args.batch_size * args.num_gpus))

print("steps_per_epoch =", steps_per_epoch)

callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_seq_ids],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, eval_data_layer,
                                                       tag_ids),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x, tag_ids, args.output_filename),
    tb_writer=tb_writer,
    eval_step=steps_per_epoch)

if args.lr_policy == "lr_warmup":
    lr_policy_func = WarmupAnnealing(args.num_epochs * steps_per_epoch,
                                     warmup_ratio=args.lr_warmup_proportion)
elif args.lr_policy == "lr_poly":
    lr_policy_func = SquareAnnealing(args.num_epochs * steps_per_epoch)
elif args.lr_policy == "lr_cosine":
    lr_policy_func = CosineAnnealing(args.num_epochs * steps_per_epoch)
else:
    raise ValueError("Invalid lr_policy, must be lr_warmup or lr_poly")

optimizer.train(tensors_to_optimize=[train_loss],
                callbacks=[callback_train, callback_eval],
                lr_policy=lr_policy_func)
Example #10
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En' to train "
        "from a pre-trained model. To train from scratch, pass a path to a model file "
        "ending with .yaml.",
    )
    parser.add_argument(
        "--amp_opt_level",
        default="O0",
        type=str,
        choices=["O0", "O1", "O2", "O3"],
        help="See: https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--train_dataset",
                        type=str,
                        required=True,
                        default=None,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs="*",
                        help="evaluation datasets paths")
    parser.add_argument("--eval_freq",
                        default=1000,
                        type=int,
                        help="Evaluation frequency")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=8,
                        help="batch size to use for evaluation")
    parser.add_argument("--local_rank",
                        default=None,
                        type=int,
                        help="node rank for distributed training")
    parser.add_argument("--stats_freq",
                        default=25,
                        type=int,
                        help="frequency with which to update train stats")
    parser.add_argument("--checkpoint_dir",
                        default=None,
                        type=str,
                        help="Folder where to save checkpoints")
    parser.add_argument("--checkpoint_save_freq",
                        required=False,
                        type=int,
                        help="how often to checkpoint")
    parser.add_argument("--optimizer", default="novograd", type=str)
    parser.add_argument("--warmup_ratio",
                        default=0.02,
                        type=float,
                        help="learning rate warmup ratio")
    parser.add_argument("--batch_size",
                        required=True,
                        type=int,
                        help="train batch size per GPU")
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        help="number of epochs to train")
    parser.add_argument("--lr", default=0.01, type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--weight_decay", default=0.001, type=float)
    parser.add_argument("--iter_per_step",
                        default=1,
                        type=int,
                        help="number of grad accumulations per batch")
    parser.add_argument("--wandb_exp_name", default=None, type=str)
    parser.add_argument("--wandb_project", default=None, type=str)
    parser.add_argument("--max_train_audio_len",
                        default=16.7,
                        type=float,
                        help="max audio length")
    parser.add_argument("--do_not_trim_silence",
                        action="store_false",
                        help="Add this flag to disable silence trimming")
    parser.add_argument("--do_not_normalize_text",
                        action="store_false",
                        help="Add this flag to set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,  # necessary for distributed training
        optimization_level=args.amp_opt_level,  # necessary for mixed precision optimization
        cudnn_benchmark=True,
    )

    # Instantiate the model which we'll train
    if args.asr_model.endswith('.yaml'):
        logging.info(
            f"Speech2Text: Will train from scratch using config from {args.asr_model}"
        )
        asr_model = nemo_asr.models.ASRConvCTCModel.import_from_config(
            args.asr_model)
    else:
        logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
        asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
            model_info=args.asr_model, local_rank=args.local_rank)

    if args.asr_model.strip().endswith('-Zh'):
        logging.info('USING CER')
        eval_metric = 'CER'
    else:
        eval_metric = 'WER'

    logging.info("\n\n")
    logging.info(f"Speech2Text: Training on {nf.world_size} GPUs.")
    logging.info(f"Training {type(asr_model)} model.")
    logging.info(f"Training CTC model with alphabet {asr_model.vocabulary}.")
    logging.info(
        f"Training CTC model with {asr_model.num_weights} weights.\n\n")

    train_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=asr_model.vocabulary,
        batch_size=args.batch_size,
        trim_silence=args.do_not_trim_silence,
        max_duration=args.max_train_audio_len,
        shuffle=True,
        normalize_transcripts=args.do_not_normalize_text,
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(asr_model.vocabulary))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    audio_signal, audio_signal_len, transcript, transcript_len = train_data_layer(
    )
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Callbacks which we'll be using:
    callbacks = []
    # SimpleLossLogger prints basic training stats (e.g. loss) to console
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        step_freq=args.stats_freq,
        print_func=partial(monitor_asr_train_progress,
                           labels=asr_model.vocabulary,
                           eval_metric=eval_metric),
    )
    callbacks.append(train_callback)
    if args.checkpoint_dir is not None and args.checkpoint_save_freq is not None:
        # Checkpoint callback saves checkpoints periodically
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq)
        callbacks.append(checkpointer_callback)

    if args.wandb_exp_name is not None and args.wandb_project is not None:
        # WandbCallback saves stats to Weights&Biases
        wandb_callback = nemo.core.WandBLogger(
            step_freq=args.stats_freq,
            wandb_name=args.wandb_exp_name,
            wandb_project=args.wandb_project,
            args=args)
        callbacks.append(wandb_callback)

    # Evaluation
    if args.eval_datasets is not None and args.eval_freq is not None:
        asr_model.eval()  # switch model to evaluation mode
        logging.info(f"Will perform evaluation every {args.eval_freq} steps.")
        for ind, eval_dataset in enumerate(args.eval_datasets):
            eval_data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                labels=asr_model.vocabulary,
                batch_size=args.eval_batch_size,
                normalize_transcripts=args.do_not_normalize_text,
            )
            audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer(
            )
            log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                               length=audio_signal_len)
            eval_predictions = greedy_decoder(log_probs=log_probs)
            eval_loss = ctc_loss(log_probs=log_probs,
                                 targets=transcript,
                                 input_length=encoded_len,
                                 target_length=transcript_len)
            tag_name = os.path.basename(eval_dataset).split(".")[0]
            eval_callback = nemo.core.EvaluatorCallback(
                eval_tensors=[
                    eval_loss, eval_predictions, transcript, transcript_len
                ],
                user_iter_callback=partial(process_evaluation_batch,
                                           labels=asr_model.vocabulary),
                user_epochs_done_callback=partial(process_evaluation_epoch,
                                                  tag=tag_name,
                                                  eval_metric=eval_metric),
                eval_step=args.eval_freq,
                wandb_name=args.wandb_exp_name,
                wandb_project=args.wandb_project,
            )
            callbacks.append(eval_callback)

    steps_in_epoch = len(train_data_layer) / (
        args.batch_size * args.iter_per_step * nf.world_size)
    lr_policy = CosineAnnealing(total_steps=args.num_epochs * steps_in_epoch,
                                warmup_ratio=args.warmup_ratio)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
        },
        batches_per_step=args.iter_per_step,
        lr_policy=lr_policy,
    )
Example #11

if args.cl:
    classify_callback = RunClassifierCallback(
        eval_step=100,
        name=args.name,
        num_classes=len(labels),
        gpu=args.classify_gpu,
        hidden_size=args.hidden_size,
        manifest=args.manifest,
        model=args.model
    )
    callbacks.append(classify_callback)

lr_policy = CosineAnnealing(
    total_steps=num_epochs * steps_per_epoch,
    warmup_ratio=0.05,
    min_lr=args.lr_end,
)

logging.info(f"Using `{lr_policy}` Learning Rate Scheduler")

neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    lr_policy=lr_policy,
    optimizer="novograd",
    optimization_params={
        "num_epochs": num_epochs,
        "max_steps": None,
        "lr": lr,
        "momentum": 0.95,
Example #12

eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[loss_v, predictions_v, transcript_v, transcript_len_v],
    # how to process evaluation batch - e.g. compute WER
    user_iter_callback=partial(process_evaluation_batch, labels=labels),
    # how to aggregate statistics (e.g. WER) for the evaluation epoch
    user_epochs_done_callback=partial(process_evaluation_epoch,
                                      tag="DEV-CLEAN",
                                      logger=logger),
    eval_step=500,
    tb_writer=tb_writer)

# Run training using your Neural Factory
# Once this "action" is called data starts flowing along train and eval DAGs
# and computations start to happen
nf.train(
    # Specify the loss to optimize for
    tensors_to_optimize=[loss],
    # Specify which callbacks you want to run
    callbacks=[train_callback, eval_callback, saver_callback],
    # Specify what optimizer to use
    optimizer="novograd",
    # Specify optimizer parameters such as num_epochs and lr
    optimization_params={
        "num_epochs": 100,
        "lr": 0.02,
        "weight_decay": 1e-4,
        "grad_norm_clip": None
    },
    batches_per_step=8,
    lr_policy=CosineAnnealing(100 * int(len(data_layer._dataset) / (16. * 8)),
                              warmup_steps=1000))
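
Note on the final training call above: CosineAnnealing is sized as 100 * int(len(data_layer._dataset) / (16. * 8)), i.e. num_epochs * steps_per_epoch with a per-GPU batch size of 16 and batches_per_step=8 (gradient accumulation). A small sketch of that arithmetic with illustrative names and the values assumed above:

def schedule_length(num_samples, num_epochs=100, batch_size=16, batches_per_step=8):
    # Samples consumed per optimizer step = batch_size * batches_per_step (single GPU assumed).
    steps_per_epoch = int(num_samples / (batch_size * batches_per_step))
    return num_epochs * steps_per_epoch
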