def test(argv):
    """CLI entry point: evaluate a DeepSpeech CTC acoustic model on swbd eval2000.

    argv: list of command-line tokens, forwarded to argparse.
    """
    cli = argparse.ArgumentParser(description="DeepSpeech AM testing")
    # for testing
    cli.add_argument('--data-path', default='data/swbd', type=str, help="dataset path to use in training")
    cli.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    cli.add_argument('--max-len', default=20., type=float, help="max length of utterance to use in secs")
    cli.add_argument('--num-workers', default=0, type=int, help="number of dataloader workers")
    cli.add_argument('--batch-size', default=4, type=int, help="number of images (and labels) to be considered in a batch")
    # optional
    cli.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    cli.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    cli.add_argument('--log-dir', default='./logs_deepspeech_ctc', type=str, help="filename for logging the outputs")
    cli.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    opts = cli.parse_args(argv)

    init_logger(log_file="test.log", **vars(opts))
    # a pretrained checkpoint is mandatory for evaluation
    assert opts.continue_from is not None

    # build the model; the trainer restores the checkpoint given via --continue-from
    net = DeepSpeech(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(net, **vars(opts))
    labeler = trainer.decoder.labeler

    # eval2000 manifest, filtered by utterance length
    manifest = f"{opts.data_path}/eval2000.csv"
    eval_set = AudioSubset(
        NonSplitTrainDataset(labeler=labeler, manifest_file=manifest),
        max_len=opts.max_len,
        min_len=opts.min_len,
    )
    eval_loader = NonSplitTrainDataLoader(
        eval_set,
        batch_size=opts.batch_size,
        num_workers=opts.num_workers,
        shuffle=True,
        pin_memory=opts.use_cuda,
    )
    trainer.test(eval_loader)
def train(argv):
    """Fully supervised CTC training of a ResNet-101 acoustic model on one dataset.

    Builds train/dev/test splits from manifests under --data-path, trains for
    --num-epochs (resuming from the trainer's saved epoch), validating on the
    dev split every epoch, and runs a final test at the end.
    """
    cli = argparse.ArgumentParser(description="ResNet AM with fully supervised training")
    # for training
    cli.add_argument('--data-path', default='data/aspire', type=str, help="dataset path to use in training")
    cli.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    cli.add_argument('--max-len', default=10., type=float, help="max length of utterance to use in secs")
    cli.add_argument('--batch-size', default=8, type=int, help="number of images (and labels) to be considered in a batch")
    cli.add_argument('--num-workers', default=8, type=int, help="number of dataloader workers")
    cli.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    cli.add_argument('--init-lr', default=1e-4, type=float, help="initial learning rate for Adam optimizer")
    cli.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    cli.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    cli.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    cli.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    cli.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    cli.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    cli.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    cli.add_argument('--log-dir', default='./logs_resnet_ctc', type=str, help="filename for logging the outputs")
    cli.add_argument('--model-prefix', default='resnet_ctc', type=str, help="model file prefix to store")
    # NOTE(review): store_true combined with default=True means this flag can
    # never be turned off from the command line; the value is always True
    cli.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    cli.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    cli.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    opts = cli.parse_args(argv)

    set_logfile(Path(opts.log_dir, "train.log"))
    version_log(opts)
    set_seed(opts.seed)

    # prepare trainer object
    net = resnet101(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(model=net, **vars(opts))
    labeler = trainer.decoder.labeler

    # one (manifest, data_size) pair per split; each split is length-filtered
    split_specs = {
        "train": (f"{opts.data_path}/train.csv", 0),
        "dev": (f"{opts.data_path}/dev.csv", 0),
        "test": (f"{opts.data_path}/test.csv", 0),
    }
    datasets, dataloaders = {}, {}
    for split, (manifest_file, data_size) in split_specs.items():
        datasets[split] = AudioSubset(
            NonSplitTrainDataset(labeler=labeler, manifest_file=manifest_file),
            data_size=data_size,
            min_len=opts.min_len,
            max_len=opts.max_len,
        )
        dataloaders[split] = NonSplitTrainDataLoader(
            datasets[split],
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            shuffle=True,
            pin_memory=opts.use_cuda,
        )

    # run training for a certain number of epochs (resumes from trainer.epoch)
    for epoch in range(trainer.epoch, opts.num_epochs):
        trainer.train_epoch(dataloaders["train"])
        trainer.validate(dataloaders["dev"])
    # final test to know WER
    trainer.test(dataloaders["test"])
def test(argv):
    """Evaluate a ListenAttendSpell AM: LER on eval2000 with --validate, else WER on rt03."""
    cli = argparse.ArgumentParser(description="ListenAttendSpell AM testing")
    # for testing
    cli.add_argument('--data-path', default='data/swbd', type=str, help="dataset path to use in training")
    cli.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    cli.add_argument('--max-len', default=20., type=float, help="max length of utterance to use in secs")
    cli.add_argument('--num-workers', default=0, type=int, help="number of dataloader workers")
    cli.add_argument('--batch-size', default=4, type=int, help="number of images (and labels) to be considered in a batch")
    # optional
    cli.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    cli.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    cli.add_argument('--log-dir', default='./logs_las', type=str, help="filename for logging the outputs")
    cli.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    cli.add_argument('--validate', default=False, action='store_true', help="test LER instead of WER")
    opts = cli.parse_args(argv)

    init_logger(log_file="test.log", **vars(opts))
    # a pretrained checkpoint is mandatory for evaluation
    assert opts.continue_from is not None

    # model consumes input frames folded by this factor; the dataset must match
    input_folding = 3
    net = ListenAttendSpell(label_vec_size=p.NUM_CTC_LABELS, input_folding=input_folding)
    amp_handle = get_amp_handle(opts)
    trainer = LASTrainer(net, amp_handle, **vars(opts))
    labeler = trainer.decoder.labeler

    # eval2000 for LER validation, rt03 for WER testing
    manifest = f"{opts.data_path}/eval2000.csv" if opts.validate else f"{opts.data_path}/rt03.csv"
    eval_set = AudioSubset(
        NonSplitTrainDataset(labeler=labeler, manifest_file=manifest, stride=input_folding),
        max_len=opts.max_len,
        min_len=opts.min_len,
    )
    eval_loader = NonSplitTrainDataLoader(
        eval_set,
        sort=True,
        batch_size=opts.batch_size,
        num_workers=opts.num_workers,
        shuffle=True,
        pin_memory=opts.use_cuda,
    )
    if opts.validate:
        trainer.validate(eval_loader)
    else:
        trainer.test(eval_loader)
def batch_train(argv):
    """Train a ListenAttendSpell AM on aspire + swbd with a length curriculum.

    Epochs 0-1 train on utterances up to 3 s, epochs 2-13 up to 5 s, epochs
    14-29 up to 10 s, and all remaining epochs up to 15 s.  Validates on swbd
    eval2000 every epoch and reports the final WER on swbd rt03.

    argv: list of command-line tokens, forwarded to argparse.
    """
    parser = argparse.ArgumentParser(description="ListenAttendSpell AM with batch training")
    # for training
    parser.add_argument('--data-path', default='/d1/jbaik/ics-asr/data', type=str, help="dataset path to use in training")
    parser.add_argument('--num-epochs', default=200, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=0.01, type=float, help="initial learning rate for Adam optimizer")
    # fix: the default is a float (0.5) but the type was int, so any value given
    # on the command line was coerced to int and fractional values like "0.5"
    # were rejected by argparse; float matches the default's type
    parser.add_argument('--max-norm', default=0.5, type=float, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--slack', default=False, action='store_true', help="use slackclient logging (need to set SLACK_API_TOKEN and SLACK_API_USER env_var")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_las', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='las', type=str, help="model file prefix to store")
    parser.add_argument('--checkpoint', default=False, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    args = parser.parse_args(argv)

    init_distributed(args.use_cuda)
    init_logger(log_file="train.log", rank=get_rank(), **vars(args))
    set_seed(args.seed)

    # prepare trainer object
    input_folding = 3
    model = ListenAttendSpell(label_vec_size=p.NUM_CTC_LABELS, input_folding=input_folding)
    amp_handle = get_amp_handle(args)
    trainer = LASTrainer(model, amp_handle, **vars(args))
    labeler = trainer.decoder.labeler

    # the training pool mixes aspire (train/dev/test) with the swbd train set
    train_datasets = [
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/aspire/train.csv", stride=input_folding),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/aspire/dev.csv", stride=input_folding),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/aspire/test.csv", stride=input_folding),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/swbd/train.csv", stride=input_folding),
    ]
    # curriculum buckets: one concatenated subset per max utterance length (secs)
    datasets = {
        f"train{max_len}": ConcatDataset([AudioSubset(d, max_len=max_len) for d in train_datasets])
        for max_len in (3, 5, 10, 15)
    }
    datasets["dev"] = NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/swbd/eval2000.csv", stride=input_folding)
    datasets["test"] = NonSplitTrainDataset(labeler=labeler, manifest_file=f"{args.data_path}/swbd/rt03.csv", stride=input_folding)

    # under distributed training the sampler shards the data, so shuffling is
    # delegated to DistributedSampler instead of the loader itself
    dataloaders = {
        key: NonSplitTrainDataLoader(
            datasets[key],
            sampler=(DistributedSampler(datasets[key]) if is_distributed() else None),
            batch_size=64,
            num_workers=32,
            shuffle=(not is_distributed()),
            pin_memory=args.use_cuda,
        )
        for key in ("train3", "train5", "train10", "train15")
    }
    dataloaders["dev"] = NonSplitTrainDataLoader(datasets["dev"], batch_size=32, num_workers=16, shuffle=False, pin_memory=args.use_cuda)
    dataloaders["test"] = NonSplitTrainDataLoader(datasets["test"], batch_size=32, num_workers=16, shuffle=False, pin_memory=args.use_cuda)

    def _bucket_for(epoch):
        # curriculum schedule: 2 epochs @3s, 12 @5s, 16 @10s, then 15s onward
        if epoch < 2:
            return "train3"
        if epoch < (2 + 4 + 8):
            return "train5"
        if epoch < (2 + 4 + 8 + 16):
            return "train10"
        return "train15"

    # run training for a certain number of epochs (resumes from trainer.epoch)
    for i in range(trainer.epoch, args.num_epochs):
        trainer.train_epoch(dataloaders[_bucket_for(i)])
        trainer.validate(dataloaders["dev"])
    # final test to know WER
    trainer.test(dataloaders["test"])
def train(argv):
    """Fully supervised CTC training of a DeepSpeech AM on aspire + swbd.

    Trains on the concatenation of aspire train/dev/test and swbd train,
    validating on swbd eval2000 every epoch and reporting final WER on rt03.
    Supports distributed training via init_distributed/DistributedSampler.
    """
    cli = argparse.ArgumentParser(description="DeepSpeech AM with fully supervised training")
    # for training
    cli.add_argument('--data-path', default='/d1/jbaik/ics-asr/data', type=str, help="dataset path to use in training")
    cli.add_argument('--min-len', default=1., type=float, help="min length of utterance to use in secs")
    cli.add_argument('--max-len', default=15., type=float, help="max length of utterance to use in secs")
    cli.add_argument('--batch-size', default=32, type=int, help="number of images (and labels) to be considered in a batch")
    cli.add_argument('--num-workers', default=32, type=int, help="number of dataloader workers")
    cli.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    cli.add_argument('--init-lr', default=0.01, type=float, help="initial learning rate for Adam optimizer")
    cli.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    cli.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    cli.add_argument('--fp16', default=False, action='store_true', help="use FP16 model")
    cli.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    cli.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    cli.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    cli.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    cli.add_argument('--slack', default=False, action='store_true', help="use slackclient logging (need to set SLACK_API_TOKEN and SLACK_API_USER env_var")
    cli.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    cli.add_argument('--log-dir', default='./logs_deepspeech_ctc', type=str, help="filename for logging the outputs")
    cli.add_argument('--model-prefix', default='deepspeech_ctc', type=str, help="model file prefix to store")
    # NOTE(review): store_true combined with default=True means this flag can
    # never be turned off from the command line; the value is always True
    cli.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    cli.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    cli.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    opts = cli.parse_args(argv)

    init_distributed(opts.use_cuda)
    init_logger(log_file="train.log", rank=get_rank(), **vars(opts))
    set_seed(opts.seed)

    # prepare trainer object
    net = DeepSpeech(num_classes=p.NUM_CTC_LABELS)
    trainer = NonSplitTrainer(model=net, **vars(opts))
    labeler = trainer.decoder.labeler

    # the training pool mixes aspire (train/dev/test) with the swbd train set
    pool = [
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/aspire/train.csv"),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/aspire/dev.csv"),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/aspire/test.csv"),
        NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/swbd/train.csv"),
    ]
    datasets = {
        "train": ConcatDataset([AudioSubset(d, data_size=0, min_len=opts.min_len, max_len=opts.max_len) for d in pool]),
        "dev": NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/swbd/eval2000.csv"),
        "test": NonSplitTrainDataset(labeler=labeler, manifest_file=f"{opts.data_path}/swbd/rt03.csv"),
    }
    # under distributed training, shuffling is delegated to the sampler
    train_sampler = DistributedSampler(datasets["train"]) if is_distributed() else None
    dataloaders = {
        "train": NonSplitTrainDataLoader(
            datasets["train"],
            sampler=train_sampler,
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            shuffle=(not is_distributed()),
            pin_memory=opts.use_cuda,
        ),
        "dev": NonSplitTrainDataLoader(datasets["dev"], batch_size=16, num_workers=8, shuffle=False, pin_memory=opts.use_cuda),
        "test": NonSplitTrainDataLoader(datasets["test"], batch_size=16, num_workers=8, shuffle=False, pin_memory=opts.use_cuda),
    }

    # run training for a certain number of epochs (resumes from trainer.epoch)
    for epoch in range(trainer.epoch, opts.num_epochs):
        trainer.train_epoch(dataloaders["train"])
        trainer.validate(dataloaders["dev"])
    # final test to know WER
    trainer.test(dataloaders["test"])
def batch_train(argv):
    """Train a ResNet-50 split-AM on aspire + swbd with a length curriculum.

    Epochs 0-9 train on utterances up to 5 s, epochs 10-29 up to 10 s, and all
    remaining epochs up to 15 s.  Validates on swbd eval2000 every epoch and
    reports the final WER on swbd rt03.

    argv: list of command-line tokens, forwarded to argparse.
    """
    parser = argparse.ArgumentParser(description="ResNet AM with batch training")
    # for training
    parser.add_argument('--num-epochs', default=100, type=int, help="number of epochs to run")
    parser.add_argument('--init-lr', default=1e-4, type=float, help="initial learning rate for Adam optimizer")
    parser.add_argument('--max-norm', default=400, type=int, help="norm cutoff to prevent explosion of gradients")
    # optional
    parser.add_argument('--use-cuda', default=False, action='store_true', help="use cuda")
    parser.add_argument('--visdom', default=False, action='store_true', help="use visdom logging")
    parser.add_argument('--visdom-host', default="127.0.0.1", type=str, help="visdom server ip address")
    parser.add_argument('--visdom-port', default=8097, type=int, help="visdom server port")
    parser.add_argument('--tensorboard', default=False, action='store_true', help="use tensorboard logging")
    parser.add_argument('--seed', default=None, type=int, help="seed for controlling randomness in this example")
    parser.add_argument('--log-dir', default='./logs_resnet_split', type=str, help="filename for logging the outputs")
    parser.add_argument('--model-prefix', default='resnet_split', type=str, help="model file prefix to store")
    # NOTE(review): store_true combined with default=True means this flag can
    # never be turned off from the command line; the value is always True
    parser.add_argument('--checkpoint', default=True, action='store_true', help="save checkpoint")
    parser.add_argument('--continue-from', default=None, type=str, help="model file path to make continued from")
    parser.add_argument('--opt-type', default="sgdr", type=str, help=f"optimizer type in {OPTIMIZER_TYPES}")
    args = parser.parse_args(argv)

    set_logfile(Path(args.log_dir, "train.log"))
    version_log(args)
    set_seed(args.seed)

    # prepare trainer object
    model = resnet50(num_classes=p.NUM_CTC_LABELS)
    trainer = SplitTrainer(model, **vars(args))
    labeler = trainer.decoder.labeler

    # the training pool mixes aspire (train/dev/test) with the swbd train set
    train_datasets = [
        SplitTrainDataset(labeler=labeler, manifest_file="data/aspire/train.csv"),
        SplitTrainDataset(labeler=labeler, manifest_file="data/aspire/dev.csv"),
        SplitTrainDataset(labeler=labeler, manifest_file="data/aspire/test.csv"),
        SplitTrainDataset(labeler=labeler, manifest_file="data/swbd/train.csv"),
    ]
    # curriculum buckets: one concatenated subset per max utterance length (secs)
    datasets = {
        f"train{max_len}": ConcatDataset([AudioSubset(d, max_len=max_len) for d in train_datasets])
        for max_len in (5, 10, 15)
    }
    datasets["dev"] = SplitTrainDataset(labeler=labeler, manifest_file="data/swbd/eval2000.csv")
    datasets["test"] = SplitTrainDataset(labeler=labeler, manifest_file="data/swbd/rt03.csv")

    dataloaders = {
        key: SplitTrainDataLoader(datasets[key], batch_size=1, num_workers=0, shuffle=True, pin_memory=args.use_cuda)
        for key in ("train5", "train10", "train15")
    }
    dataloaders["dev"] = SplitTrainDataLoader(datasets["dev"], batch_size=1, num_workers=0, shuffle=False, pin_memory=args.use_cuda)
    # fix: the test loader was built with shuffle=True, unlike the dev loader
    # above and every other evaluation loader in this file; shuffling an
    # evaluation set serves no purpose (WER is order-independent), so disable it
    dataloaders["test"] = SplitTrainDataLoader(datasets["test"], batch_size=1, num_workers=0, shuffle=False, pin_memory=args.use_cuda)

    # run training for a certain number of epochs (resumes from trainer.epoch)
    for i in range(trainer.epoch, args.num_epochs):
        # curriculum schedule: 10 epochs @5s, 20 @10s, then 15s onward
        if i < 10:
            trainer.train_epoch(dataloaders["train5"])
        elif i < 30:
            trainer.train_epoch(dataloaders["train10"])
        else:
            trainer.train_epoch(dataloaders["train15"])
        trainer.validate(dataloaders["dev"])
    # final test to know WER
    trainer.test(dataloaders["test"])