import argparse
import os
from ast import literal_eval


def parse_args():
    """
    Parse commandline arguments.
    """
    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir', default=None, required=True,
                         help='path to directory with training/validation data')
    dataset.add_argument('--max-size', default=None, type=int,
                         help='use at most MAX_SIZE elements from training \
                         dataset (useful for benchmarking), by default \
                         uses entire dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir', default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save', default='gnmt_wmt16',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq', default=10, type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--model-config',
                       default="{'hidden_size': 1024,'num_layers': 4, \
                       'dropout': 0.2, 'share_embedding': True}",
                       help='GNMT architecture configuration')
    model.add_argument('--smoothing', default=0.1, type=float,
                       help='label smoothing, if equal to zero model will use \
                       CrossEntropyLoss, if not zero model will be trained \
                       with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math', default='fp16', choices=['fp32', 'fp16'],
                         help='arithmetic type')
    general.add_argument('--seed', default=None, type=int,
                         help='set random number generator seed')
    general.add_argument('--disable-eval', action='store_true', default=False,
                         help='disables validation after every epoch')
    general.add_argument('--workers', default=0, type=int,
                         help='number of workers for data loading')

    cuda_parser = general.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument('--cuda', dest='cuda', action='store_true',
                             help='enables cuda (use \'--no-cuda\' to disable)')
    cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false',
                             help=argparse.SUPPRESS)
    cuda_parser.set_defaults(cuda=True)

    cudnn_parser = general.add_mutually_exclusive_group(required=False)
    cudnn_parser.add_argument('--cudnn', dest='cudnn', action='store_true',
                              help='enables cudnn (use \'--no-cudnn\' to disable)')
    cudnn_parser.add_argument('--no-cudnn', dest='cudnn', action='store_false',
                              help=argparse.SUPPRESS)
    cudnn_parser.set_defaults(cudnn=True)

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--batch-size', default=128, type=int,
                          help='batch size for training')
    training.add_argument('--epochs', default=8, type=int,
                          help='number of total epochs to run')
    training.add_argument('--optimization-config',
                          default="{'optimizer': 'Adam', 'lr': 5e-4}",
                          type=str,
                          help='optimizer config')
    training.add_argument('--grad-clip', default=5.0, type=float,
                          help='enables gradient clipping and sets maximum \
                          gradient norm value')
    training.add_argument('--max-length-train', default=50, type=int,
                          help='maximum sequence length for training')
    training.add_argument('--min-length-train', default=0, type=int,
                          help='minimum sequence length for training')

    bucketing_parser = training.add_mutually_exclusive_group(required=False)
    bucketing_parser.add_argument('--bucketing', dest='bucketing',
                                  action='store_true',
                                  help='enables bucketing (use \
                                  \'--no-bucketing\' to disable)')
    bucketing_parser.add_argument('--no-bucketing', dest='bucketing',
                                  action='store_false',
                                  help=argparse.SUPPRESS)
    bucketing_parser.set_defaults(bucketing=True)

    # validation
    validation = parser.add_argument_group('validation setup')
    validation.add_argument('--val-batch-size', default=128, type=int,
                            help='batch size for validation')
    validation.add_argument('--max-length-val', default=80, type=int,
                            help='maximum sequence length for validation')
    validation.add_argument('--min-length-val', default=0, type=int,
                            help='minimum sequence length for validation')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size', default=128, type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test', default=150, type=int,
                      help='maximum sequence length for test')
    test.add_argument('--min-length-test', default=0, type=int,
                      help='minimum sequence length for test')
    test.add_argument('--beam-size', default=5, type=int,
                      help='beam size')
    test.add_argument('--len-norm-factor', default=0.6, type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor', default=0.1, type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const', default=5.0, type=float,
                      help='length normalization constant')
    test.add_argument('--target-bleu', default=None, type=float,
                      help='target accuracy')
    test.add_argument('--intra-epoch-eval', default=0, type=int,
                      help='evaluate within epoch')

    # checkpointing
    checkpoint = parser.add_argument_group('checkpointing setup')
    checkpoint.add_argument('--start-epoch', default=0, type=int,
                            help='manually set initial epoch counter')
    checkpoint.add_argument('--resume', default=None, type=str, metavar='PATH',
                            help='resumes training from checkpoint from PATH')
    checkpoint.add_argument('--save-all', action='store_true', default=False,
                            help='saves checkpoint after every epoch')
    checkpoint.add_argument('--save-freq', default=5000, type=int,
                            help='save checkpoint every SAVE_FREQ batches')
    checkpoint.add_argument('--keep-checkpoints', default=0, type=int,
                            help='keep only last KEEP_CHECKPOINTS checkpoints, \
                            affects only checkpoints controlled by --save-freq \
                            option')

    # distributed support
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank', default=0, type=int,
                             help='rank of the process, do not set! Done by \
                             multiproc module')
    distributed.add_argument('--world-size', default=1, type=int,
                             help='number of processes, do not set! Done by \
                             multiproc module')
    distributed.add_argument('--dist-url', default='tcp://localhost:23456',
                             type=str,
                             help='url used to set up distributed training')

    return parser.parse_args()
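
# A minimal sketch (not part of the original script) of how the string-encoded
# defaults above ('--model-config', '--optimization-config') can be turned into
# Python dicts. Whether the training code does exactly this is an assumption,
# and the helper name below is hypothetical.
def _demo_parse_config_strings():
    model_config = literal_eval("{'hidden_size': 1024, 'num_layers': 4, "
                                "'dropout': 0.2, 'share_embedding': True}")
    opt_config = literal_eval("{'optimizer': 'Adam', 'lr': 5e-4}")
    # literal_eval safely evaluates literal containers, unlike eval()
    assert model_config['num_layers'] == 4
    assert opt_config['optimizer'] == 'Adam'
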
def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}', dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}', dest=f'{destname}',
                              action='store_false',
                              help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir', default='data/wmt16_de_en',
                         help='path to the directory with training/test data')
    dataset.add_argument('--max-size', default=None, type=int,
                         help='use at most MAX_SIZE elements from training \
                         dataset (useful for benchmarking), by default \
                         uses entire dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir', default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save', default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq', default=10, type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size', default=1024, type=int,
                       help='model hidden size')
    model.add_argument('--num-layers', default=4, type=int,
                       help='number of RNN layers')
    model.add_argument('--dropout', default=0.2, type=float,
                       help='dropout')
    exclusive_group(group=model, name='share-embedding', default=True,
                    help='share embedding')
    model.add_argument('--smoothing', default=0.1, type=float,
                       help='label smoothing, if equal to zero model will use \
                       CrossEntropyLoss, if not zero model will be trained \
                       with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math', default='fp16', choices=['fp16', 'fp32'],
                         help='arithmetic type')
    general.add_argument('--seed', default=None, type=int,
                         help='set random number generator seed')
    exclusive_group(group=general, name='eval', default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general, name='env', default=True,
                    help='print info about execution env')
    exclusive_group(group=general, name='cuda', default=True,
                    help='enables cuda')
    exclusive_group(group=general, name='cudnn', default=True,
                    help='enables cudnn')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--train-batch-size', default=128, type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size', default=None, type=int,
                          help='global training batch size')
    training.add_argument('--train-iter-size', default=1, type=int,
                          help='training iter size')
    training.add_argument('--epochs', default=6, type=int,
                          help='number of training epochs')
    training.add_argument('--grad-clip', default=5.0, type=float,
                          help='enables gradient clipping and sets maximum \
                          gradient norm value')
    training.add_argument('--max-length-train', default=50, type=int,
                          help='maximum sequence length for training')
    training.add_argument('--min-length-train', default=0, type=int,
                          help='minimum sequence length for training')
    training.add_argument('--train-loader-workers', default=2, type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching', default='sharding', type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size', default=80, type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets', default=5, type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer', type=str, default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr', type=float, default=1.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra', type=str, default="{}",
                           help='extra options for the optimizer')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps', type=str, default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps', type=str, default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval', type=str, default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps', type=int, default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor', type=float, default=0.5,
                           help='learning rate decay factor')

    # validation
    val = parser.add_argument_group('validation setup')
    val.add_argument('--val-batch-size', default=128, type=int,
                     help='batch size for validation')
    val.add_argument('--max-length-val', default=80, type=int,
                     help='maximum sequence length for validation')
    val.add_argument('--min-length-val', default=0, type=int,
                     help='minimum sequence length for validation')
    val.add_argument('--val-loader-workers', default=0, type=int,
                     help='number of workers for validation data loading')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size', default=128, type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test', default=150, type=int,
                      help='maximum sequence length for test')
    test.add_argument('--min-length-test', default=0, type=int,
                      help='minimum sequence length for test')
    test.add_argument('--beam-size', default=5, type=int,
                      help='beam size')
    test.add_argument('--len-norm-factor', default=0.6, type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor', default=0.1, type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const', default=5.0, type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval', default=0, type=int,
                      help='evaluate within epoch')
    test.add_argument('--test-loader-workers', default=0, type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch', default=0, type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume', default=None, type=str, metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all', action='store_true', default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq', default=5000, type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints', default=0, type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-perf', default=None, type=float,
                           help='target training performance (in tokens \
                           per second)')
    benchmark.add_argument('--target-bleu', default=None, type=float,
                           help='target accuracy')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank', default=0, type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank', default=0, type=int,
                             help='local rank of the process, do not set!')

    args = parser.parse_args()

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
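
# Why the scheduler flags above are declared with type=str and then run
# through literal_eval after parsing: a single flag may carry an int, a float,
# or None, and argparse has no native union type. A minimal demonstration
# (the helper name below is illustrative, not from the original script):
def _demo_scheduler_flag_parsing():
    for raw in ('200', '0.666', 'None'):
        value = literal_eval(raw)
        print(f'{raw!r} -> {value!r} ({type(value).__name__})')
    # '200' -> 200 (int), '0.666' -> 0.666 (float), 'None' -> None (NoneType)
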
def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}', dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}', dest=f'{destname}',
                              action='store_false',
                              help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir', default='data/wmt16_de_en',
                         help='path to the directory with training/test data')
    dataset.add_argument('--src-lang', default='en',
                         help='source language')
    dataset.add_argument('--tgt-lang', default='de',
                         help='target language')
    dataset.add_argument('--vocab', default='vocab.bpe.32000',
                         help='path to the vocabulary file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('-bpe', '--bpe-codes', default='bpe.32000',
                         help='path to the file with bpe codes \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--train-src',
                         default='train.tok.clean.bpe.32000.en',
                         help='path to the training source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--train-tgt',
                         default='train.tok.clean.bpe.32000.de',
                         help='path to the training target data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--val-src',
                         default='newstest_dev.tok.clean.bpe.32000.en',
                         help='path to the validation source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--val-tgt',
                         default='newstest_dev.tok.clean.bpe.32000.de',
                         help='path to the validation target data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--test-src',
                         default='newstest2014.tok.bpe.32000.en',
                         help='path to the test source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--test-tgt',
                         default='newstest2014.de',
                         help='path to the test target data file \
                         (relative to DATASET_DIR directory)')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir', default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save-dir', default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq', default=10, type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size', default=1024, type=int,
                       help='hidden size of the model')
    model.add_argument('--num-layers', default=4, type=int,
                       help='number of RNN layers in encoder and in decoder')
    model.add_argument('--dropout', default=0.2, type=float,
                       help='dropout applied to input of RNN cells')
    exclusive_group(group=model, name='share-embedding', default=True,
                    help='use shared embeddings for encoder and decoder')
    model.add_argument('--smoothing', default=0.1, type=float,
                       help='label smoothing, if equal to zero model will use \
                       CrossEntropyLoss, if not zero model will be trained \
                       with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math', default='fp16',
                         choices=['fp16', 'fp32', 'manual_fp16'],
                         help='precision')
    general.add_argument('--seed', default=None, type=int,
                         help='master seed for random number generators, if \
                         "seed" is undefined then the master seed will be \
                         sampled from random.SystemRandom()')
    general.add_argument('--prealloc-mode', default='always', type=str,
                         choices=['off', 'once', 'always'],
                         help='controls preallocation')
    exclusive_group(group=general, name='eval', default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general, name='env', default=True,
                    help='print info about execution env')
    exclusive_group(group=general, name='cuda', default=True,
                    help='enables cuda')
    exclusive_group(group=general, name='cudnn', default=True,
                    help='enables cudnn')
    exclusive_group(group=general, name='log-all-ranks', default=True,
                    help='enables logging from all distributed ranks, if \
                    disabled then only logs from rank 0 are reported')

    # training
    training = parser.add_argument_group('training setup')
    dataset.add_argument('--train-max-size', default=None, type=int,
                         help='use at most TRAIN_MAX_SIZE elements from \
                         training dataset (useful for benchmarking), by \
                         default uses entire dataset')
    training.add_argument('--train-batch-size', default=128, type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size', default=None, type=int,
                          help='global training batch size, this argument \
                          does not have to be defined, if it is defined it \
                          will be used to automatically \
                          compute train_iter_size \
                          using the equation: train_iter_size = \
                          train_global_batch_size // (train_batch_size * \
                          world_size)')
    training.add_argument('--train-iter-size', metavar='N', default=1,
                          type=int,
                          help='training iter size, training loop will \
                          accumulate gradients over N iterations and execute \
                          optimizer every N steps')
    training.add_argument('--epochs', default=6, type=int,
                          help='max number of training epochs')
    training.add_argument('--grad-clip', default=5.0, type=float,
                          help='enables gradient clipping and sets maximum \
                          norm of gradients')
    training.add_argument('--train-max-length', default=50, type=int,
                          help='maximum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-min-length', default=0, type=int,
                          help='minimum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-loader-workers', default=2, type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching', default='bucketing', type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size', default=80, type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets', default=5, type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer', type=str, default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr', type=float, default=2.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra', type=str, default="{}",
                           help='extra options for the optimizer')

    # mixed precision loss scaling
    loss_scaling = parser.add_argument_group(
        'mixed precision loss scaling setup')
    loss_scaling.add_argument('--init-scale', type=float, default=8192,
                              help='initial loss scale')
    loss_scaling.add_argument('--upscale-interval', type=float, default=128,
                              help='loss upscaling interval')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps', type=str, default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps', type=str, default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval', type=str, default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps', type=int, default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor', type=float, default=0.5,
                           help='learning rate decay factor')

    # validation
    val = parser.add_argument_group('validation setup')
    val.add_argument('--val-batch-size', default=64, type=int,
                     help='batch size for validation')
    val.add_argument('--val-max-length', default=125, type=int,
                     help='maximum sequence length for validation \
                     (including special BOS and EOS tokens)')
    val.add_argument('--val-min-length', default=0, type=int,
                     help='minimum sequence length for validation \
                     (including special BOS and EOS tokens)')
    val.add_argument('--val-loader-workers', default=0, type=int,
                     help='number of workers for validation data loading')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size', default=128, type=int,
                      help='batch size for test')
    test.add_argument('--test-max-length', default=150, type=int,
                      help='maximum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--test-min-length', default=0, type=int,
                      help='minimum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--beam-size', default=5, type=int,
                      help='beam size')
    test.add_argument('--len-norm-factor', default=0.6, type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor', default=0.1, type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const', default=5.0, type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval', metavar='N', default=0, type=int,
                      help='evaluate within training epoch, this option will \
                      enable extra N equally spaced evaluations executed \
                      during each training epoch')
    test.add_argument('--test-loader-workers', default=0, type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch', default=0, type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume', default=None, type=str, metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all', action='store_true', default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq', default=5000, type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints', default=0, type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-perf', default=None, type=float,
                           help='target training performance (in tokens \
                           per second)')
    benchmark.add_argument('--target-bleu', default=None, type=float,
                           help='target accuracy')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank', default=0, type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank', default=0, type=int,
                             help='local rank of the process, do not set!')

    args = parser.parse_args()

    args.lang = {'src': args.src_lang, 'tgt': args.tgt_lang}

    args.save_dir = os.path.join(args.results_dir, args.save_dir)
    args.vocab = os.path.join(args.dataset_dir, args.vocab)
    args.bpe_codes = os.path.join(args.dataset_dir, args.bpe_codes)
    args.train_src = os.path.join(args.dataset_dir, args.train_src)
    args.train_tgt = os.path.join(args.dataset_dir, args.train_tgt)
    args.val_src = os.path.join(args.dataset_dir, args.val_src)
    args.val_tgt = os.path.join(args.dataset_dir, args.val_tgt)
    args.test_src = os.path.join(args.dataset_dir, args.test_src)
    args.test_tgt = os.path.join(args.dataset_dir, args.test_tgt)

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
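
# Standalone sketch of the '--<name>/--no-<name>' boolean pattern that the
# exclusive_group() helper above builds. It mirrors the helper on a throwaway
# parser rather than calling it, so the function below is illustrative only.
def _demo_exclusive_group_pattern():
    p = argparse.ArgumentParser()
    g = p.add_mutually_exclusive_group(required=False)
    g.add_argument('--cuda', dest='cuda', action='store_true')
    g.add_argument('--no-cuda', dest='cuda', action='store_false')
    g.set_defaults(cuda=True)
    assert p.parse_args([]).cuda is True            # default wins
    assert p.parse_args(['--no-cuda']).cuda is False
    # supplying both '--cuda' and '--no-cuda' raises an argparse error,
    # because the two flags share a mutually exclusive group
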
def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}', dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}', dest=f'{destname}',
                              action='store_false',
                              help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir', default='data/wmt16_de_en',
                         help='path to the directory with training/test data')
    dataset.add_argument('--max-size', default=None, type=int,
                         help='use at most MAX_SIZE elements from training \
                         dataset (useful for benchmarking), by default \
                         uses entire dataset')
    dataset.add_argument('--preproc-data-dir', default='/tmp/preprocessed',
                         help='path to the directory with preprocessed \
                         training/test data')
    exclusive_group(group=dataset, name='use-preproc-data', default=True,
                    help='use preprocessed dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir', default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save', default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq', default=10, type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size', default=1024, type=int,
                       help='model hidden size')
    model.add_argument('--num-layers', default=4, type=int,
                       help='number of RNN layers in encoder and in decoder')
    model.add_argument('--dropout', default=0.2, type=float,
                       help='dropout applied to input of RNN cells')
    exclusive_group(group=model, name='share-embedding', default=True,
                    help='use shared embeddings for encoder and decoder')
    model.add_argument('--smoothing', default=0.1, type=float,
                       help='label smoothing, if equal to zero model will use \
                       CrossEntropyLoss, if not zero model will be trained \
                       with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math', default='fp16', choices=['fp32', 'fp16'],
                         help='arithmetic type')
    general.add_argument('--seed', default=None, type=int,
                         help='master seed for random number generators, if \
                         "seed" is undefined then the master seed will be \
                         sampled from random.SystemRandom()')
    general.add_argument('--prealloc-mode', default='always', type=str,
                         choices=['off', 'once', 'always'],
                         help='controls preallocation')
    exclusive_group(group=general, name='eval', default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general, name='env', default=False,
                    help='print info about execution env')
    exclusive_group(group=general, name='cuda', default=True,
                    help='enables cuda')
    exclusive_group(group=general, name='cudnn', default=True,
                    help='enables cudnn')
    exclusive_group(group=general, name='log-all-ranks', default=True,
                    help='enables logging from all distributed ranks, if \
                    disabled then only logs from rank 0 are reported')
    exclusive_group(group=general, name='fused-attention', default=False,
                    help='enables fused attention')
    exclusive_group(group=general, name='fused-xentropy', default=True,
                    help='enables fused cross entropy with label \
                    smoothing')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--train-batch-size', default=128, type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size', default=None, type=int,
                          help='global training batch size, this argument \
                          does not have to be defined, if it is defined it \
                          will be used to automatically \
                          compute train_iter_size \
                          using the equation: train_iter_size = \
                          train_global_batch_size // (train_batch_size * \
                          world_size)')
    training.add_argument('--train-iter-size', metavar='N', default=1,
                          type=int,
                          help='training iter size, training loop will \
                          accumulate gradients over N iterations and execute \
                          optimizer every N steps')
    training.add_argument('--epochs', default=8, type=int,
                          help='max number of training epochs')
    training.add_argument('--grad-clip', default=5.0, type=float,
                          help='enables gradient clipping and sets maximum \
                          norm of gradients')
    training.add_argument('--max-length-train', default=50, type=int,
                          help='maximum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--min-length-train', default=0, type=int,
                          help='minimum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-loader-workers', default=1, type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching', default='bucketing', type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size', default=80, type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets', default=5, type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer', type=str, default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr', type=float, default=1.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra', type=str, default="{}",
                           help='extra options for the optimizer')

    # mixed precision loss scaling
    loss_scaling = parser.add_argument_group(
        'mixed precision loss scaling setup')
    loss_scaling.add_argument('--init-scale', type=float, default=1024,
                              help='initial loss scale')
    loss_scaling.add_argument('--upscale-interval', type=float, default=128,
                              help='loss upscaling interval')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps', type=str, default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps', type=str, default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval', type=str, default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps', type=int, default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor', type=float, default=0.5,
                           help='learning rate decay factor')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size', default=128, type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test', default=150, type=int,
                      help='maximum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--min-length-test', default=0, type=int,
                      help='minimum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--beam-size', default=5, type=int,
                      help='beam size')
    test.add_argument('--len-norm-factor', default=0.6, type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor', default=0.1, type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const', default=5.0, type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval', metavar='N', default=0, type=int,
                      help='evaluate within training epoch, this option will \
                      enable extra N equally spaced evaluations executed \
                      during each training epoch')
    test.add_argument('--test-loader-workers', default=0, type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch', default=0, type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume', default=None, type=str, metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all', action='store_true', default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq', default=5000, type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints', default=0, type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-bleu', default=24.0, type=float,
                           help='target accuracy, training will be stopped \
                           when the target is achieved')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank', default=0, type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank',
                             default=os.getenv('LOCAL_RANK', 0), type=int,
                             help='local rank of the process, do not set!')
    distributed.add_argument('--enable-apex-allreduce-overlap',
                             action='store_true', default=False,
                             help='enable overlap of allreduce communication \
                             with bprop')
    distributed.add_argument('--apex-num-allreduce-streams', default=1,
                             type=int,
                             help='num. allreduce streams')
    distributed.add_argument('--apex-message-size', default=int(1e7),
                             type=int,
                             help='min. number of elements in communication \
                             bucket')

    # distributed weight update
    dwu_group = parser.add_argument_group('distributed weight update setup')
    dwu_group.add_argument('--distributed-weight-update', '--dwu',
                           default=0, type=int, metavar='DWU',
                           help='select distributed weight update strategy')
    dwu_group.add_argument('--dwu-group-size', '--dwugs', default=0,
                           type=int, metavar='DWUGS',
                           help='distributed weight update group size, \
                           if arg is 0, defaults to one node')
    dwu_group.add_argument('--dwu-num-blocks', '--dwunb', default=8,
                           type=int, metavar='DWUNB',
                           help='number of blocks in dwu scheme')
    dwu_group.add_argument('--dwu-num-chunks', '--dwuchks', default=4,
                           type=int,
                           help='number of chunks of each parameters block')
    dwu_group.add_argument('--dwu-num-rs-pg', '--dwurspg', default=2,
                           type=int, metavar='DWURSPG',
                           help='number of reduce-scatter streams in dwu \
                           scheme')
    dwu_group.add_argument('--dwu-num-ar-pg', '--dwuarpg', default=4,
                           type=int, metavar='DWUARPG',
                           help='number of all-reduce streams in dwu scheme')
    dwu_group.add_argument('--dwu-num-ag-pg', '--dwuagpg', default=2,
                           type=int, metavar='DWUAGPG',
                           help='number of all-gather streams in dwu scheme')
    dwu_group.add_argument('--dwu-full-pipeline', action='store_true',
                           help='whether to do full or partial pipeline')
    dwu_group.add_argument('--dwu-overlap-reductions', action='store_true',
                           help='whether to overlap reductions with backprop')
    dwu_group.add_argument('--dwu-grad-norm', action='store_true',
                           help='whether to compute L2 grad norm')
    dwu_group.add_argument('--dwu-e5m2-allgather', action='store_true',
                           help='whether to use e5m2 allgather')

    args = parser.parse_args()

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
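
# Worked example of the relationship stated in the --train-global-batch-size
# help text. The code that actually derives train_iter_size is outside this
# function, so the numbers below are purely illustrative.
def _demo_train_iter_size():
    train_batch_size = 128          # per-worker batch size
    world_size = 8                  # number of distributed workers
    train_global_batch_size = 2048  # requested effective batch size
    train_iter_size = \
        train_global_batch_size // (train_batch_size * world_size)
    assert train_iter_size == 2     # accumulate gradients over 2 iterations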