Example #1
0
def test(argv):
    """Evaluate a trained DeepSpeech acoustic model on the eval2000 set.

    Parses CLI options from *argv*, restores the model checkpoint given by
    --continue-from, and runs the trainer's test pass over the dataset.

    Raises:
        ValueError: if --continue-from is not supplied.
    """
    parser = argparse.ArgumentParser(description="DeepSpeech AM testing")
    # for testing
    parser.add_argument('--data-path', default='data/swbd', type=str, help="dataset path to use in training")
    parser.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    parser.add_argument('--max-len', default=20., type=float, help="max length of utterance to use in secs")
    parser.add_argument('--num-workers', default=0, type=int, help="number of dataloader workers")
    parser.add_argument('--batch-size', default=4, type=int, help="number of images (and labels) to be considered in a batch")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    parser.add_argument('--log-dir', default='./logs_deepspeech_ctc', type=str, help="filename for logging the outputs")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    args = parser.parse_args(argv)

    init_logger(log_file="test.log", **vars(args))

    # explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and a missing checkpoint must always abort testing
    if args.continue_from is None:
        raise ValueError("--continue-from is required for testing")

    model = DeepSpeech(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(model, **vars(args))
    labeler = trainer.decoder.labeler

    manifest = f"{args.data_path}/eval2000.csv"
    dataset = AudioSubset(NonSplitTrainDataset(labeler=labeler, manifest_file=manifest),
                          max_len=args.max_len, min_len=args.min_len)
    dataloader = NonSplitTrainDataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                                         shuffle=True, pin_memory=args.use_cuda)

    trainer.test(dataloader)
Example #2
0
def train(argv):
    """Fully supervised training of a ResNet acoustic model with CTC labels.

    Parses CLI options from *argv*, builds train/dev/test datasets from the
    manifests under --data-path, trains for --num-epochs (resuming from
    trainer.epoch when continuing), and finally reports test metrics.
    """
    parser = argparse.ArgumentParser(description="ResNet AM with fully supervised training")
    # for training
    parser.add_argument('--data-path', default='data/aspire', type=str, help="dataset path to use in training")
    parser.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    parser.add_argument('--max-len', default=10., type=float, help="max length of utterance to use in secs")
    parser.add_argument('--batch-size', default=8, type=int, help="number of images (and labels) to be considered in a batch")
    parser.add_argument('--num-workers', default=8, type=int, help="number of dataloader workers")
    parser.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=1e-4, type=float, help="initial learning rate for Adam optimizer")
    parser.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_resnet_ctc', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='resnet_ctc', type=str, help="model file prefix to store")
    # NOTE(review): default=True with action='store_true' makes this flag a
    # no-op (args.checkpoint is always True); the default should likely be
    # False — confirm intent before changing
    parser.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")

    args = parser.parse_args(argv)

    set_logfile(Path(args.log_dir, "train.log"))
    version_log(args)
    set_seed(args.seed)

    # prepare trainer object
    model = resnet101(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(model=model, **vars(args))
    labeler = trainer.decoder.labeler

    # per-split (manifest path, data_size) pairs; data_size=0 presumably means
    # "use the whole manifest" — AudioSubset's semantics not visible here
    data_opts = {
        "train" : (f"{args.data_path}/train.csv", 0),
        "dev"   : (f"{args.data_path}/dev.csv", 0),
        "test"  : (f"{args.data_path}/test.csv", 0),
    }
    datasets, dataloaders = dict(), dict()
    # unpack the tuple directly instead of `for k, (v)` + manual unpacking
    for split, (manifest_file, data_size) in data_opts.items():
        datasets[split] = AudioSubset(NonSplitTrainDataset(labeler=labeler, manifest_file=manifest_file),
                                      data_size=data_size, min_len=args.min_len, max_len=args.max_len)
        dataloaders[split] = NonSplitTrainDataLoader(datasets[split], batch_size=args.batch_size,
                                                     num_workers=args.num_workers, shuffle=True,
                                                     pin_memory=args.use_cuda)

    # train + validate each epoch (resumes from trainer.epoch when continuing)
    for i in range(trainer.epoch, args.num_epochs):
        trainer.train_epoch(dataloaders["train"])
        trainer.validate(dataloaders["dev"])

    # final test to know WER
    trainer.test(dataloaders["test"])
Example #3
0
def test(argv):
    """Evaluate a trained ListenAttendSpell acoustic model.

    With --validate, runs validation (LER) on eval2000; otherwise runs the
    test pass (WER) on rt03. Requires a trained model via --continue-from.

    Raises:
        ValueError: if --continue-from is not supplied.
    """
    parser = argparse.ArgumentParser(description="ListenAttendSpell AM testing")
    # for testing
    parser.add_argument('--data-path', default='data/swbd', type=str, help="dataset path to use in training")
    parser.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    parser.add_argument('--max-len', default=20., type=float, help="max length of utterance to use in secs")
    parser.add_argument('--num-workers', default=0, type=int, help="number of dataloader workers")
    parser.add_argument('--batch-size', default=4, type=int, help="number of images (and labels) to be considered in a batch")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    parser.add_argument('--log-dir', default='./logs_las', type=str, help="filename for logging the outputs")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--validate', default=False, action='store_true', help="test LER instead of WER")
    args = parser.parse_args(argv)

    init_logger(log_file="test.log", **vars(args))

    # explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and a missing checkpoint must always abort testing
    if args.continue_from is None:
        raise ValueError("--continue-from is required for testing")

    # the same folding factor is passed to both the model and the dataset stride
    input_folding = 3
    model = ListenAttendSpell(label_vec_size=p.NUM_CTC_LABELS, input_folding=input_folding)

    amp_handle = get_amp_handle(args)
    trainer = LASTrainer(model, amp_handle, **vars(args))
    labeler = trainer.decoder.labeler

    # validate on eval2000, test on rt03
    manifest = f"{args.data_path}/eval2000.csv" if args.validate else f"{args.data_path}/rt03.csv"

    dataset = AudioSubset(NonSplitTrainDataset(labeler=labeler, manifest_file=manifest, stride=input_folding),
                          max_len=args.max_len, min_len=args.min_len)
    dataloader = NonSplitTrainDataLoader(dataset, sort=True, batch_size=args.batch_size, num_workers=args.num_workers,
                                         shuffle=True, pin_memory=args.use_cuda)

    if args.validate:
        trainer.validate(dataloader)
    else:
        trainer.test(dataloader)
Example #4
0
def batch_train(argv):
    """Train a ListenAttendSpell acoustic model with a length-curriculum schedule.

    Builds concatenated training datasets capped at 3/5/10/15-second utterances
    and switches to longer caps as epochs progress, validating on eval2000 each
    epoch and reporting final test metrics on rt03. Supports distributed
    training (DistributedSampler replaces loader-level shuffling).
    """
    parser = argparse.ArgumentParser(description="ListenAttendSpell AM with batch training")
    # for training
    parser.add_argument('--data-path', default='/d1/jbaik/ics-asr/data', type=str, help="dataset path to use in training")
    parser.add_argument('--num-epochs', default=200, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=0.01, type=float, help="initial learning rate for Adam optimizer")
    parser.add_argument('--max-norm', default=0.5, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--slack', default=False, action='store_true', help="use slackclient logging (need to set SLACK_API_TOKEN and SLACK_API_USER env_var")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_las', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='las', type=str, help="model file prefix to store")
    parser.add_argument('--checkpoint', default=False, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    args = parser.parse_args(argv)

    init_distributed(args.use_cuda)
    init_logger(log_file="train.log", rank=get_rank(), **vars(args))
    set_seed(args.seed)

    # prepare trainer object
    # the same folding factor is passed to both the model and the dataset stride
    input_folding = 3
    model = ListenAttendSpell(label_vec_size=p.NUM_CTC_LABELS, input_folding=input_folding)

    amp_handle = get_amp_handle(args)
    trainer = LASTrainer(model, amp_handle, **vars(args))
    labeler = trainer.decoder.labeler

    def make_dataset(manifest):
        # one dataset per manifest; stride matches the model's input folding
        return NonSplitTrainDataset(labeler=labeler, manifest_file=manifest, stride=input_folding)

    train_datasets = [
        make_dataset(f"{args.data_path}/aspire/train.csv"),
        make_dataset(f"{args.data_path}/aspire/dev.csv"),
        make_dataset(f"{args.data_path}/aspire/test.csv"),
        make_dataset(f"{args.data_path}/swbd/train.csv"),
    ]

    # curriculum pools: the same sources, capped at increasing utterance lengths
    datasets = {
        f"train{cap}": ConcatDataset([AudioSubset(d, max_len=cap) for d in train_datasets])
        for cap in (3, 5, 10, 15)
    }
    datasets["dev"] = make_dataset(f"{args.data_path}/swbd/eval2000.csv")
    datasets["test"] = make_dataset(f"{args.data_path}/swbd/rt03.csv")

    def make_train_loader(key):
        # distributed runs use a sampler and must not also shuffle in the loader
        return NonSplitTrainDataLoader(datasets[key],
                                       sampler=(DistributedSampler(datasets[key])
                                                if is_distributed() else None),
                                       batch_size=64, num_workers=32,
                                       shuffle=(not is_distributed()),
                                       pin_memory=args.use_cuda)

    def make_eval_loader(key):
        # dev/test are evaluated in fixed order with a smaller batch
        return NonSplitTrainDataLoader(datasets[key],
                                       batch_size=32, num_workers=16,
                                       shuffle=False, pin_memory=args.use_cuda)

    dataloaders = {key: make_train_loader(key)
                   for key in ("train3", "train5", "train10", "train15")}
    dataloaders["dev"] = make_eval_loader("dev")
    dataloaders["test"] = make_eval_loader("test")

    # curriculum schedule: 2 epochs at <=3s, then through epoch 13 at <=5s,
    # through epoch 29 at <=10s, then <=15s (resumes from trainer.epoch)
    for i in range(trainer.epoch, args.num_epochs):
        if i < 2:
            key = "train3"
        elif i < (2 + 4 + 8):
            key = "train5"
        elif i < (2 + 4 + 8 + 16):
            key = "train10"
        else:
            key = "train15"
        trainer.train_epoch(dataloaders[key])
        trainer.validate(dataloaders["dev"])

    # final test to know WER
    trainer.test(dataloaders["test"])
Example #5
0
def train(argv):
    """Fully supervised training of a DeepSpeech acoustic model with CTC labels.

    Combines aspire train/dev/test and swbd train manifests into one training
    pool, validates on eval2000 each epoch, and reports final test metrics on
    rt03. Supports distributed training (DistributedSampler replaces
    loader-level shuffling).
    """
    parser = argparse.ArgumentParser(description="DeepSpeech AM with fully supervised training")
    # for training
    parser.add_argument('--data-path', default='/d1/jbaik/ics-asr/data', type=str, help="dataset path to use in training")
    parser.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    parser.add_argument('--max-len', default=15., type=float, help="max length of utterance to use in secs")
    parser.add_argument('--batch-size', default=32, type=int, help="number of images (and labels) to be considered in a batch")
    parser.add_argument('--num-workers', default=32, type=int, help="number of dataloader workers")
    parser.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=0.01, type=float, help="initial learning rate for Adam optimizer")
    parser.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--slack', default=False, action='store_true', help="use slackclient logging (need to set SLACK_API_TOKEN and SLACK_API_USER env_var")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_deepspeech_ctc', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='deepspeech_ctc', type=str, help="model file prefix to store")
    # NOTE(review): default=True with action='store_true' makes this flag a
    # no-op (args.checkpoint is always True); the default should likely be
    # False — confirm intent before changing
    parser.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    args = parser.parse_args(argv)

    init_distributed(args.use_cuda)
    init_logger(log_file="train.log", rank=get_rank(), **vars(args))
    set_seed(args.seed)

    # prepare trainer object
    model = DeepSpeech(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(model=model, **vars(args))
    labeler = trainer.decoder.labeler

    train_manifests = (
        f"{args.data_path}/aspire/train.csv",
        f"{args.data_path}/aspire/dev.csv",
        f"{args.data_path}/aspire/test.csv",
        f"{args.data_path}/swbd/train.csv",
    )
    train_datasets = [NonSplitTrainDataset(labeler=labeler, manifest_file=m)
                      for m in train_manifests]

    datasets = {
        "train": ConcatDataset([AudioSubset(d, data_size=0, min_len=args.min_len, max_len=args.max_len)
                                for d in train_datasets]),
        "dev"  : NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/swbd/eval2000.csv"),
        "test" : NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/swbd/rt03.csv"),
    }

    def make_eval_loader(key):
        # dev/test are evaluated in fixed order with a smaller batch
        return NonSplitTrainDataLoader(datasets[key],
                                       batch_size=16, num_workers=8,
                                       shuffle=False, pin_memory=args.use_cuda)

    dataloaders = {
        # distributed runs use a sampler and must not also shuffle in the loader
        "train": NonSplitTrainDataLoader(datasets["train"],
                                         sampler=(DistributedSampler(datasets["train"])
                                                  if is_distributed() else None),
                                         batch_size=args.batch_size,
                                         num_workers=args.num_workers,
                                         shuffle=(not is_distributed()),
                                         pin_memory=args.use_cuda),
        "dev"  : make_eval_loader("dev"),
        "test" : make_eval_loader("test"),
    }

    # train + validate each epoch (resumes from trainer.epoch when continuing)
    for i in range(trainer.epoch, args.num_epochs):
        trainer.train_epoch(dataloaders["train"])
        trainer.validate(dataloaders["dev"])

    # final test to know WER
    trainer.test(dataloaders["test"])
Example #6
0
def batch_train(argv):
    """Train a split-mode ResNet acoustic model with a length-curriculum schedule.

    Builds concatenated training datasets capped at 5/10/15-second utterances
    and switches to longer caps as epochs progress, validating on eval2000 each
    epoch and reporting final test metrics on rt03.
    """
    parser = argparse.ArgumentParser(description="ResNet AM with batch training")
    # for training
    parser.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=1e-4, type=float, help="initial learning rate for Adam optimizer")
    parser.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_resnet_split', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='resnet_split', type=str, help="model file prefix to store")
    # NOTE(review): default=True with action='store_true' makes this flag a
    # no-op (args.checkpoint is always True); the default should likely be
    # False — confirm intent before changing
    parser.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")

    args = parser.parse_args(argv)

    set_logfile(Path(args.log_dir, "train.log"))
    version_log(args)
    set_seed(args.seed)

    # prepare trainer object
    model = resnet50(num_classes=p.NUM_CTC_LABELS)
    trainer = SplitTrainer(model, **vars(args))
    labeler = trainer.decoder.labeler

    train_manifests = (
        "data/aspire/train.csv",
        "data/aspire/dev.csv",
        "data/aspire/test.csv",
        "data/swbd/train.csv",
    )
    train_datasets = [SplitTrainDataset(labeler=labeler, manifest_file=m)
                      for m in train_manifests]

    # curriculum pools: the same sources, capped at increasing utterance lengths
    datasets = {
        f"train{cap}": ConcatDataset([AudioSubset(d, max_len=cap) for d in train_datasets])
        for cap in (5, 10, 15)
    }
    datasets["dev"] = SplitTrainDataset(labeler=labeler, manifest_file="data/swbd/eval2000.csv")
    datasets["test"] = SplitTrainDataset(labeler=labeler, manifest_file="data/swbd/rt03.csv")

    def make_loader(key, shuffle):
        # split training consumes one utterance at a time (batch_size=1)
        return SplitTrainDataLoader(datasets[key],
                                    batch_size=1, num_workers=0,
                                    shuffle=shuffle, pin_memory=args.use_cuda)

    dataloaders = {key: make_loader(key, True)
                   for key in ("train5", "train10", "train15")}
    dataloaders["dev"] = make_loader("dev", False)
    # evaluation order does not affect WER; shuffle=False here for consistency
    # with the dev loader and the other examples' test loaders (the original
    # shuffled the final test set)
    dataloaders["test"] = make_loader("test", False)

    # curriculum schedule: 10 epochs at <=5s, through epoch 29 at <=10s, then
    # <=15s (resumes from trainer.epoch when continuing)
    for i in range(trainer.epoch, args.num_epochs):
        if i < 10:
            key = "train5"
        elif i < 30:
            key = "train10"
        else:
            key = "train15"
        trainer.train_epoch(dataloaders[key])
        trainer.validate(dataloaders["dev"])

    # final test to know WER
    trainer.test(dataloaders["test"])