Example #1
import argparse

import nemo.utils.argparse as nm_argparse


def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='Jasper',
        conflict_handler='resolve')
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=64,
        eval_batch_size=64,
        lr=0.02,
        amp_opt_level="O1",
        create_tb_writer=True
    )

    # Overwrite default args
    parser.add_argument("--max_steps", type=int, default=None, required=False,
                        help="max number of steps to train")
    parser.add_argument("--num_epochs", type=int, default=None, required=False,
                        help="number of epochs to train")
    parser.add_argument("--model_config", type=str, required=True,
                        help="model configuration file: model.yaml")

    # Create new args
    parser.add_argument("--exp_name", default="Jasper", type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.add_argument("--warmup_steps", default=0, type=int)

    args = parser.parse_args()

    if args.max_steps is not None and args.num_epochs is not None:
        raise ValueError("Only one of max_steps or num_epochs should be provided.")
    return args
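
The snippets on this page all build their parsers the same way: NeMo's common flags are inherited through parents=[nm_argparse.NemoArgParser()], some of them are redeclared under conflict_handler='resolve', and defaults are overridden with parser.set_defaults(). A minimal, self-contained sketch of that mechanism in plain argparse (the base parser below stands in for NemoArgParser, and the flag names are only illustrative):

import argparse

# Stand-in for nm_argparse.NemoArgParser(): a parent parser with common flags.
base = argparse.ArgumentParser(add_help=False)
base.add_argument("--lr", type=float, default=0.001)
base.add_argument("--num_epochs", type=int, default=None)

parser = argparse.ArgumentParser(parents=[base], conflict_handler="resolve")
parser.set_defaults(lr=0.02)  # override an inherited default
parser.add_argument("--num_epochs", type=int, default=None, required=False,
                    help="redeclared without error thanks to 'resolve'")

args = parser.parse_args([])
print(args.lr)          # 0.02 -- set_defaults overrides the parent's default
print(args.num_epochs)  # None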
Example #2
def parse_args():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='Jasper Aishell',
                                     conflict_handler='resolve')

    parser.set_defaults(model_config="./configs/jasper12x1SEP.yaml",
                        work_dir="./tmp",
                        checkpoint_dir="./tmp",
                        optimizer="novograd",
                        num_epochs=50,
                        batch_size=32,
                        eval_batch_size=16,
                        lr=0.015,
                        weight_decay=0.001,
                        warmup_steps=8000,
                        checkpoint_save_freq=1000,
                        train_eval_freq=50,
                        eval_freq=4000)

    # Create new args
    parser.add_argument("--vocab_file",
                        type=str,
                        required=True,
                        help="vocabulary file path")
    parser.add_argument("--exp_name", default="Jasper Aishell", type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    # Note: this explicit default (0) overrides warmup_steps=8000 from
    # set_defaults() above; see the ordering sketch after this example.
    parser.add_argument("--warmup_steps", default=0, type=int)

    args = parser.parse_args()
    if args.max_steps is not None:
        raise ValueError("Jasper uses num_epochs instead of max_steps")

    return args
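
A detail worth noting when reading these snippets: the relative order of set_defaults() and add_argument() decides which default wins. In this example --warmup_steps is added with an explicit default of 0 after set_defaults(warmup_steps=8000), so the effective default is 0, not 8000. A minimal sketch of both orderings in plain argparse (flag names are illustrative):

import argparse

# set_defaults() after add_argument(): it rewrites the existing action's default.
p = argparse.ArgumentParser()
p.add_argument("--train_freq", type=int, default=300)
p.set_defaults(train_freq=10)
print(p.parse_args([]).train_freq)  # 10

# add_argument() with an explicit default after set_defaults(): the explicit
# default wins, because action defaults are applied before parser defaults.
q = argparse.ArgumentParser()
q.set_defaults(warmup_steps=8000)
q.add_argument("--warmup_steps", type=int, default=0)
print(q.parse_args([]).warmup_steps)  # 0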
Example #3
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='GarNet RnnLM',
        conflict_handler='resolve')
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=32,
        eval_batch_size=32,
        num_epochs=25,
        weight_decay=1e-5,
        lr=0.02,
        amp_opt_level="O1",
        create_tb_writer=True
    )

    # Overwrite default args
    parser.add_argument("--num_epochs", type=int, default=None, required=True,
                        help="number of epochs to train. You should specify "
                             "either num_epochs or max_steps")
    parser.add_argument("--model_config", type=str, required=True,
                        help="model configuration file: model.yaml")
    parser.add_argument("--eval_datasets", type=str, required=True,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--exp_name", default="GarNet", type=str)
    parser.add_argument("--random_seed", default=0, type=float)

    args = parser.parse_args()
    if args.max_steps is not None:
        raise ValueError("GarNet RNNLM uses num_epochs instead of max_steps")

    return args
Example #4
def parse_args():
    parser = argparse.ArgumentParser(
        description='FastSpeech training pipeline.',
        parents=[nm_argparse.NemoArgParser()],
        conflict_handler='resolve',  # For parents common flags.
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer='adam',
        batch_size=16,
        work_dir='fastspeech_output',
        eval_batch_size=32,
        num_epochs=10,
        lr=0.001,
        amp_opt_level='O0',
        create_tb_writer=True,
        lr_policy=None,
        weight_decay=1e-6,
    )

    parser.add_argument('--id', type=str, default='default', help="Experiment identifier for clarity.")
    parser.add_argument('--durations_dir', type=str, help="Train dataset durations directory path.")
    parser.add_argument('--grad_norm_clip', type=float, default=1.0, help="Gradient clipping.")
    parser.add_argument('--min_lr', type=float, default=1e-5, help="Minimum learning rate to decay to.")

    args = parser.parse_args()

    return args
Example #5
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='QuartzNet',
        conflict_handler='resolve')
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=32,
        eval_batch_size=64,
        lr=0.01,
        weight_decay=0.001,
        amp_opt_level="O0",
        create_tb_writer=True
    )

    # Overwrite default args
    parser.add_argument("--num_epochs", type=int, default=None, required=True,
                        help="number of epochs to train. You should specify "
                             "either num_epochs or max_steps")
    parser.add_argument("--model_config", type=str, required=True,
                        help="model configuration file: model.yaml")

    # Create new args
    parser.add_argument("--exp_name", default="QuartzNet", type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--warmup_steps", default=1000, type=int)
    parser.add_argument("--load_dir", default=None, type=str)

    args = parser.parse_args()
    if args.max_steps is not None:
        raise ValueError("QuartzNet uses num_epochs instead of max_steps")

    return args
Example #6
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='Jasper Speech Commands',
        conflict_handler='resolve',
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="sgd",
        batch_size=128,
        eval_batch_size=128,
        lr=0.1,
        amp_opt_level="O1",
        create_tb_writer=True,
    )

    # Overwrite default args
    parser.add_argument(
        "--max_steps",
        type=int,
        default=None,
        required=False,
        help="max number of steps to train",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=None,
        required=False,
        help="number of epochs to train",
    )
    parser.add_argument(
        "--model_config",
        type=str,
        required=True,
        help="model configuration file: model.yaml",
    )

    # Create new args
    parser.add_argument("--exp_name",
                        default="Jasper_Speech_Commands",
                        type=str)
    parser.add_argument('--min_lr', default=1e-3, type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--warmup_ratio", default=0.0, type=float)
    parser.add_argument("--hold_ratio", default=0.0, type=float)
    parser.add_argument(
        "--load_dir",
        default=None,
        type=str,
        help="directory with pre-trained checkpoint",
    )

    args = parser.parse_args()

    if args.max_steps is not None and args.num_epochs is not None:
        raise ValueError("Only one of max_steps or num_epochs should be provided.")
    return args
Example #7
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='Waveglow',
        conflict_handler='resolve',
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="adam",
        batch_size=12,
        eval_batch_size=12,
        lr=0.0001,
        amp_opt_level="O1",
        create_tb_writer=True,
        lr_policy=None,
        weight_decay=1e-6,
    )

    # Overwrite default args
    parser.add_argument("--max_steps",
                        type=int,
                        default=None,
                        help="max number of steps to train")
    parser.add_argument("--num_epochs",
                        type=int,
                        default=None,
                        help="number of epochs to train")
    parser.add_argument("--model_config",
                        type=str,
                        required=True,
                        help="model configuration file: model.yaml")

    # Create new args
    parser.add_argument("--exp_name", default="Waveglow", type=str)

    args = parser.parse_args()

    if args.lr_policy:
        raise NotImplementedError("Waveglow does not support lr policy arg")
    if args.max_steps is not None and args.num_epochs is not None:
        raise ValueError("Only one of max_steps or num_epochs should be provided.")
    if args.eval_freq % 25 != 0:
        raise ValueError("eval_freq should be a multiple of 25.")

    exp_directory = [
        f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}",
        "",
        f"-wd_{args.weight_decay}-opt_{args.optimizer}-ips_{args.iter_per_step}",
    ]
    if args.max_steps:
        exp_directory[1] = f"-s_{args.max_steps}"
    elif args.num_epochs:
        exp_directory[1] = f"-e_{args.num_epochs}"
    else:
        raise ValueError("Both max_steps and num_epochs were None.")
    return args, "".join(exp_directory)
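
For reference, this is the shape of the suffix string returned next to args. The values below are made up purely to illustrate the joined result:

# Hypothetical values, chosen only to show the shape of the joined string.
exp_name, lr, batch_size = "Waveglow", 0.0001, 12
weight_decay, optimizer, iter_per_step = 1e-6, "adam", 1
num_epochs = 100

parts = [
    f"{exp_name}-lr_{lr}-bs_{batch_size}",
    f"-e_{num_epochs}",  # or f"-s_{max_steps}" when training by steps
    f"-wd_{weight_decay}-opt_{optimizer}-ips_{iter_per_step}",
]
print("".join(parts))  # Waveglow-lr_0.0001-bs_12-e_100-wd_1e-06-opt_adam-ips_1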
Example #8
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()], description='Tacotron2', conflict_handler='resolve',
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="adam",
        batch_size=48,
        eval_batch_size=32,
        lr=0.001,
        amp_opt_level="O0",
        create_tb_writer=True,
        lr_policy=None,
        weight_decay=1e-6,
    )

    # Overwrite default args
    parser.add_argument("--max_steps", type=int, default=None, help="max number of steps to train")
    parser.add_argument("--num_epochs", type=int, default=None, help="number of epochs to train")
    parser.add_argument("--model_config", type=str, required=True, help="model configuration file: model.yaml")
    parser.add_argument("--grad_norm_clip", type=float, default=1.0, help="gradient clipping")
    parser.add_argument("--min_lr", type=float, default=1e-5, help="minimum learning rate to decay to")
    parser.add_argument(
        "--do_not_eval_at_start", action='store_true', help="toggle for whether to do evaluation on step 0"
    )
    parser.add_argument("--decoder_force", action='store_true', help="toggle for teacher forcing during evaluation")
    parser.add_argument("--random_seed", default=None, type=int, help="random seed for torch, numpy, and random")

    # Create new args
    parser.add_argument("--exp_name", default="Tacotron2", type=str)

    args = parser.parse_args()

    if args.lr_policy:
        raise NotImplementedError("Tacotron 2 does not support lr policy arg")
    if args.max_steps is not None and args.num_epochs is not None:
        raise ValueError("Only one of max_steps or num_epochs should be provided.")
    if args.eval_freq % 25 != 0:
        raise ValueError("eval_freq should be a multiple of 25.")

    exp_directory = [
        f"{args.exp_name}-lr_{args.lr}-bs_{args.batch_size}",
        "",
        f"-wd_{args.weight_decay}-opt_{args.optimizer}-ips_{args.iter_per_step}",
    ]
    if args.max_steps:
        exp_directory[1] = f"-s_{args.max_steps}"
    elif args.num_epochs:
        exp_directory[1] = f"-e_{args.num_epochs}"
    else:
        raise ValueError("Both max_steps and num_epochs were None.")
    return args, "".join(exp_directory)
Example #9
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description="SpeakerRecognition",
        conflict_handler="resolve",
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=32,
        eval_batch_size=64,
        lr=0.01,
        weight_decay=0.001,
        amp_opt_level="O1",
        create_tb_writer=True,
    )

    # Overwrite default args
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=None,
        required=True,
        help=
        "number of epochs to train. You should specify either num_epochs or max_steps",
    )
    parser.add_argument(
        "--model_config",
        type=str,
        required=True,
        help="model configuration file: model.yaml",
    )

    # Create new args
    parser.add_argument("--exp_name", default="SpkrReco_GramMatrix", type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--warmup_steps", default=1000, type=int)
    parser.add_argument("--load_dir", default=None, type=str)
    parser.add_argument("--synced_bn",
                        action="store_true",
                        help="Use synchronized batch norm")
    parser.add_argument("--emb_size", default=256, type=int)
    parser.add_argument("--synced_bn_groupsize", default=0, type=int)
    parser.add_argument("--print_freq", default=256, type=int)

    args = parser.parse_args()
    if args.max_steps is not None:
        raise ValueError(
            "SpeakerRecognition uses num_epochs instead of max_steps")

    return args
Example #10
def parse_args():
    parser = argparse.ArgumentParser(
        description='TalkNet Mels Predictor Training Pipeline',
        parents=[nm_argparse.NemoArgParser()],
        conflict_handler='resolve',  # For parents common flags.
    )
    parser.add_argument('--eval_names', type=str, nargs="*", default=[], help="Eval datasets names.")
    parser.add_argument("--eval_datasets", type=str, nargs="*", default=[], help="Evaluation datasets paths.")
    parser.add_argument('--train_freq', type=int, default=300, help="Train metrics logging frequency.")
    parser.add_argument('--grad_norm_clip', type=float, help="grad norm clip")
    parser.add_argument('--warmup', type=int, default=3000, help="Number of steps for warmup.")
    parser.add_argument('--min_lr', type=float, default=1e-5, help="Minimum learning rate to decay to.")
    parser.add_argument('--wdb_project', type=str, help="WandB run project")
    parser.add_argument('--wdb_name', type=str, help="WandB run name")
    parser.add_argument('--wdb_tags', type=str, nargs="*", default=[], help="WandB run tags")
    parser.set_defaults(
        amp_opt_level='O0',  # O1/O2 works notably faster, O3 usually produces NaNs.
        model_config='configs/talknet-mels-lj.yaml',
        batch_size=64,
        eval_batch_size=64,
        train_freq=10,
        eval_freq=100,  # 10x train freq
        optimizer='adam',
        weight_decay=1e-6,
        grad_norm_clip=1.0,
        warmup=3000,
        num_epochs=100,
        lr=1e-3,  # Works well with Adam.
        min_lr=1e-5,  # Works well with cosine policy.
        work_dir='work/' + str(datetime.datetime.now()).replace(' ', '_'),
        checkpoint_save_freq=10000,
        wdb_project='fast-tts',
        wdb_name='test_' + str(datetime.datetime.now()).replace(' ', '_'),
        wdb_tags=['mels', 'test', 'to-delete'],
    )

    # Required: train_dataset
    # Optional: eval_names, eval_datasets

    # Durations
    parser.add_argument('--train_durs', type=str, required=True, help="Train dataset durations directory path.")
    parser.add_argument('--eval_durs', type=str, nargs='*', default=[], help="Eval datasets durations")
    parser.add_argument('--durs_type', type=str, choices=['pad', 'full-pad'], default='full-pad', help="Durs type")

    args = parser.parse_args()

    return args
Example #11
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()], description='ContextNet', conflict_handler='resolve',
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=32,
        eval_batch_size=64,
        lr=0.01,
        weight_decay=0.001,
        amp_opt_level="O0",
        create_tb_writer=True,
    )

    # Overwrite default args
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=None,
        required=True,
        help="number of epochs to train. You should specify either num_epochs or max_steps",
    )
    parser.add_argument(
        "--model_config", type=str, required=True, help="model configuration file: model.yaml",
    )

    # Create new args
    parser.add_argument("--exp_name", default="ContextNet", type=str)
    parser.add_argument("--project", default=None, type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--warmup_steps", default=1000, type=int)
    parser.add_argument("--warmup_ratio", default=None, type=float)
    parser.add_argument('--min_lr', default=1e-5, type=float)
    parser.add_argument("--load_dir", default=None, type=str)
    parser.add_argument("--synced_bn", action='store_true', help="Use synchronized batch norm")
    parser.add_argument("--synced_bn_groupsize", default=0, type=int)
    parser.add_argument("--update_freq", default=50, type=int, help="Metrics update freq")
    parser.add_argument("--eval_freq", default=1000, type=int, help="Evaluation frequency")
    parser.add_argument('--kernel_size_factor', default=1.0, type=float)

    args = parser.parse_args()
    if args.max_steps is not None:
        raise ValueError("ContextNet uses num_epochs instead of max_steps")

    return args
Example #12
def parse_args():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description="Jasper",
        conflict_handler="resolve",
    )
    parser.set_defaults(
        checkpoint_dir=None,
        optimizer="novograd",
        batch_size=64,
        eval_batch_size=64,
        lr=0.002,
        amp_opt_level="O1",
        create_tb_writer=True,
        model_config="./train/jasper10x5dr.yaml",
        work_dir="./train/work",
        num_epochs=300,
        weight_decay=0.005,
        checkpoint_save_freq=100,
        eval_freq=100,
        load_dir="./train/models/jasper/",
        warmup_steps=3,
        exp_name="jasper-speller",
    )

    # Overwrite default args
    parser.add_argument(
        "--max_steps",
        type=int,
        default=None,
        required=False,
        help="max number of steps to train",
    )
    parser.add_argument("--num_epochs",
                        type=int,
                        required=False,
                        help="number of epochs to train")
    parser.add_argument(
        "--model_config",
        type=str,
        required=False,
        help="model configuration file: model.yaml",
    )
    parser.add_argument(
        "--encoder_checkpoint",
        type=str,
        required=True,
        help="encoder checkpoint file: JasperEncoder.pt",
    )
    parser.add_argument(
        "--decoder_checkpoint",
        type=str,
        required=True,
        help="decoder checkpoint file: JasperDecoderForCTC.pt",
    )
    parser.add_argument(
        "--remote_data",
        type=str,
        required=False,
        default="",
        help="remote dataloader endpoint",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=False,
        default="",
        help="dataset directory containing train/test manifests",
    )

    # Create new args (where a flag also appears in set_defaults() above, the
    # explicit default given here takes precedence)
    parser.add_argument("--exp_name", default="Jasper", type=str)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument(
        "--load_dir",
        default=None,
        type=str,
        help="directory with pre-trained checkpoint",
    )

    args = parser.parse_args()
    if args.max_steps is None and args.num_epochs is None:
        raise ValueError("Either max_steps or num_epochs should be provided.")
    return args
Example #13

import argparse

import nemo.utils.argparse as nm_argparse
from nemo.collections.cv.modules.data_layers import CIFAR100DataLayer
from nemo.collections.cv.modules.losses import NLLLoss
from nemo.collections.cv.modules.non_trainables import NonLinearity, ReshapeTensor
from nemo.collections.cv.modules.trainables import FeedForwardNetwork, ImageEncoder
from nemo.core import DeviceType, NeuralGraph, NeuralModuleFactory, OperationMode, SimpleLossLoggerCallback
from nemo.utils import logging

if __name__ == "__main__":
    # Create the default parser.
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     conflict_handler='resolve')
    # Parse the arguments
    args = parser.parse_args()

    # Instantiate Neural Factory.
    nf = NeuralModuleFactory(local_rank=args.local_rank,
                             placement=DeviceType.CPU)

    # Data layer - upscale the CIFAR100 images to ImageNet resolution.
    cifar100_dl = CIFAR100DataLayer(height=224, width=224, train=True)
    # The "model".
    image_encoder = ImageEncoder(model_type="vgg16",
                                 return_feature_maps=True,
                                 pretrained=True,
                                 name="vgg16")
Example #14
def main():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='AN4 ASR',
                                     conflict_handler='resolve')

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs=1,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="/home/mrjenkins/TestData/an4_dataset/an4_train.json",
        eval_datasets="/home/mrjenkins/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        checkpoint_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        batch_size=32,
        eval_batch_size=16,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1")

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       optimization_level=args.amp_opt_level,
                                       random_seed=0,
                                       log_dir=args.work_dir,
                                       checkpoint_dir=args.checkpoint_dir,
                                       create_tb_writer=True,
                                       cudnn_benchmark=args.cudnn_benchmark)
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir
    args.checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params)

    num_samples = len(data_layer)
    total_steps = int(num_samples * args.num_epochs / args.batch_size)
    print("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioPreprocessing(
        sample_rate=sample_rate, **jasper_params["AudioPreprocessing"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params)

    num_samples = len(data_layer_eval)
    nf.logger.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioPreprocessing"]["features"],
        **jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(log_probs=log_probs_e,
                      targets=transcript_e,
                      input_length=encoded_len_e,
                      target_length=transcript_len_e)
    nf.logger.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=lambda x: monitor_asr_train_progress(x, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=lambda x, y: process_evaluation_batch(
            x, y, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=tb_writer)

    nf.train(tensors_to_optimize=[loss],
             callbacks=[train_callback, eval_callback, checkpointer_callback],
             optimizer=args.optimizer,
             lr_policy=CosineAnnealing(total_steps=total_steps),
             optimization_params={
                 "num_epochs": args.num_epochs,
                 "max_steps": args.max_steps,
                 "lr": args.lr,
                 "momentum": args.momentum,
                 "betas": betas,
                 "weight_decay": args.weight_decay,
                 "grad_norm_clip": None
             },
             batches_per_step=args.iter_per_step)

    if args.test_after_training:
        # Create BeamSearch NM
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=64,
            alpha=2.,
            beta=1.5,
            lm_path=args.lm,
            num_cpus=max(os.cpu_count(), 1))
        beam_predictions = beam_search_with_lm(log_probs=log_probs_e,
                                               log_probs_length=encoded_len_e)
        eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
        nf.logger.info("Greedy WER: {:.2f}".format(wer * 100))
        assert wer <= wer_thr, (
            "Final eval greedy WER {:.2f}% > than {:.2f}%".format(
                wer * 100, wer_thr * 100))

        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                   references=references)
        nf.logger.info("Beam WER {:.2f}%".format(beam_wer * 100))
        assert beam_wer <= beam_wer_thr, (
            "Final eval beam WER {:.2f}%  > than {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100))
        assert beam_wer <= wer, ("Final eval beam WER > than the greedy WER.")

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True)

        nf.reset_trainer()
        nf.train(tensors_to_optimize=[loss],
                 callbacks=[train_callback, checkpointer_callback],
                 optimizer=args.optimizer,
                 optimization_params={
                     "num_epochs": args.num_epochs + 10,
                     "lr": args.lr,
                     "momentum": args.momentum,
                     "betas": betas,
                     "weight_decay": args.weight_decay,
                     "grad_norm_clip": None
                 },
                 reset=True)

        evaluated_tensors = nf.infer(eval_tensors[:-1])
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
        nf.logger.info("New greedy WER: {:.2f}%".format(wer_new * 100))
        assert wer_new <= wer * 1.1, (
            f"Fine tuning: new WER {wer_new * 100:.2f}% > than the previous "
            f"WER {wer * 100:.2f}%")
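
This example and the next gate the final evaluation on word error rate thresholds (wer_thr = 0.20, beam_wer_thr = 0.15). As a reminder of the metric, here is a minimal single-pair sketch; it is not NeMo's word_error_rate, which aggregates the edit distance over a whole list of hypothesis/reference pairs:

def simple_wer(hyp: str, ref: str) -> float:
    """Word-level Levenshtein distance divided by the reference length."""
    h, r = hyp.split(), ref.split()
    d = list(range(len(h) + 1))          # DP row: distance to each hyp prefix
    for i, rw in enumerate(r, 1):
        prev, d[0] = d[0], i
        for j, hw in enumerate(h, 1):
            cur = min(d[j] + 1,           # deletion
                      d[j - 1] + 1,       # insertion
                      prev + (rw != hw))  # substitution / match
            prev, d[j] = d[j], cur
    return d[len(h)] / max(len(r), 1)


print(simple_wer("the cat sat", "the cat sat on the mat"))  # 0.5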
Example #15
def main():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='AN4 ASR',
                                     conflict_handler='resolve')

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs=1,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="/home/mrjenkins/TestData/an4_dataset/an4_train.json",
        eval_datasets="/home/mrjenkins/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        batch_size=48,
        eval_batch_size=64,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1")

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       files_to_copy=[__file__],
                                       optimization_level=args.amp_opt_level,
                                       random_seed=0,
                                       log_dir=args.work_dir,
                                       create_tb_writer=True,
                                       cudnn_benchmark=args.cudnn_benchmark)
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    (loss, eval_tensors, callbacks, total_steps, vocab, log_probs_e,
     encoded_len_e) = create_dags(jasper_params, args, nf)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        lr_policy=CosineAnnealing(total_steps=total_steps,
                                  min_lr=args.lr / 100),
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": args.momentum,
            "betas": betas,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None
        },
        batches_per_step=args.iter_per_step,
        amp_max_loss_scale=256.,
        # synced_batchnorm=(nf.global_rank is not None),
    )

    if args.test_after_training:
        nemo.logging.info("Testing greedy and beam search with LM WER.")
        # Create BeamSearch NM
        if nf.world_size > 1:
            nemo.logging.warning("Skipping beam search WER as it does not "
                                 "work if doing distributed training.")
        else:
            beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                vocab=vocab,
                beam_width=64,
                alpha=2.,
                beta=1.5,
                lm_path=args.lm,
                num_cpus=max(os.cpu_count(), 1))
            beam_predictions = beam_search_with_lm(
                log_probs=log_probs_e, log_probs_length=encoded_len_e)
            eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
            nemo.logging.info("Greedy WER: {:.2f}%".format(wer * 100))
            if wer > wer_thr:
                nf.sync_all_processes(False)
                raise ValueError(f"Final eval greedy WER {wer*100:.2f}% > "
                                 f"than {wer_thr*100:.2f}%")
        nf.sync_all_processes()

        if nf.world_size == 1:
            beam_hypotheses = []
            # Over mini-batch
            for i in evaluated_tensors[-1]:
                # Over samples
                for j in i:
                    beam_hypotheses.append(j[0][1])

            beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                       references=references)
            nemo.logging.info("Beam WER {:.2f}%".format(beam_wer * 100))
            assert beam_wer <= beam_wer_thr, (
                "Final eval beam WER {:.2f}%  > than {:.2f}%".format(
                    beam_wer * 100, beam_wer_thr * 100))
            assert beam_wer <= wer, (
                "Final eval beam WER > than the greedy WER.")

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True)

        # Distributed Data Parallel changes the underlying class so we need
        # to reinstantiate Encoder and Decoder
        args.num_epochs += 10
        previous_step_count = total_steps
        loss, eval_tensors, callbacks, total_steps, vocab, _, _ = create_dags(
            jasper_params, args, nf)

        nf.reset_trainer()
        nf.train(
            tensors_to_optimize=[loss],
            callbacks=callbacks,
            optimizer=args.optimizer,
            lr_policy=CosineAnnealing(warmup_steps=previous_step_count,
                                      total_steps=total_steps),
            optimization_params={
                "num_epochs": args.num_epochs,
                "lr": args.lr / 100,
                "momentum": args.momentum,
                "betas": betas,
                "weight_decay": args.weight_decay,
                "grad_norm_clip": None
            },
            reset=True,
            amp_max_loss_scale=256.,
            # synced_batchnorm=(nf.global_rank is not None),
        )

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                      references=references)
            nemo.logging.info("New greedy WER: {:.2f}%".format(wer_new * 100))
            if wer_new > wer * 1.1:
                nf.sync_all_processes(False)
                raise ValueError(
                    f"Fine tuning: new WER {wer_new * 100:.2f}% > than the "
                    f"previous WER {wer * 100:.2f}%")
        nf.sync_all_processes()

        # Open the log file and check that the logged epochs are consecutive,
        # starting from 0
        if nf._exp_manager.log_file:
            epochs = []
            with open(nf._exp_manager.log_file, "r") as log_file:
                line = log_file.readline()
                while line:
                    index = line.find("Starting epoch")
                    if index != -1:
                        epochs.append(int(line[index +
                                               len("Starting epoch"):]))
                    line = log_file.readline()
            for i, e in enumerate(epochs):
                if i != e:
                    raise ValueError("Epochs from log file were not understood")