Example No. 1
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu', default=0.0, type=float, metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')

    # Criterion args
    parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                        help='epsilon for label smoothing, 0 means no label smoothing')

    # Parallel backward + all-reduce optimization
    group.add_argument('--enable-parallel-backward-allred-opt', action='store_true',
                       help='enable all reduce of w-gradients in parallel with backward propagation (only for FP16 training)')
    group.add_argument('--parallel-backward-allred-opt-threshold', type=int, default=0, metavar='N',
                       help='min num of contiguous gradient elements before all-reduce is triggered')
    group.add_argument('--enable-parallel-backward-allred-opt-correctness-check', action='store_true',
                       help='compare w-gradient values obtained doing all-reduce in parallel vs. at the end')

    return group
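
A minimal usage sketch for the variant above (not part of the original snippet). OPTIMIZER_REGISTRY and LR_SCHEDULER_REGISTRY are placeholder dicts defined in the same module as the function; in fairseq they are populated by the packages under fairseq/optim/ and fairseq/optim/lr_scheduler/.

import argparse

OPTIMIZER_REGISTRY = {'nag': None, 'adam': None, 'sgd': None}            # assumed contents
LR_SCHEDULER_REGISTRY = {'reduce_lr_on_plateau': None, 'fixed': None}    # assumed contents

parser = argparse.ArgumentParser('train')
add_optimization_args(parser)
args = parser.parse_args(['--optimizer', 'adam', '--lr', '0.25,0.1', '--max-epoch', '30'])

# In this variant --lr is stored as a plain string, so the caller splits the
# comma-separated schedule itself:
lrs = [float(x) for x in str(args.lr).split(',')]
print(args.optimizer, lrs, args.max_epoch)   # adam [0.25, 0.1] 30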
Example No. 2
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    # fmt: off
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N1,N2,...,N_K',
                       type=lambda uf: eval_str_list(uf, type=int),
                       help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', type=eval_str_list,
                       metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='fixed',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--min-lr', default=-1, type=float, metavar='LR',
                       help='stop training when the learning rate reaches this minimum')
    # fmt: on
    return group
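
This variant parses --lr and --update-freq through eval_str_list, which is not shown in the snippet. A plausible reconstruction, modeled on fairseq's utility of the same name; treat the details as an assumption.

# Hypothetical eval_str_list: turn a string such as "0.25,0.1" or "[4, 2, 1]"
# into a list of the requested element type.
def eval_str_list(x, type=float):
    if x is None:
        return None
    if isinstance(x, str):
        x = eval(x)            # "0.25,0.1" evaluates to a tuple, "4" to an int
    try:
        return list(map(type, x))
    except TypeError:          # a bare scalar, e.g. --lr 0.25
        return [type(x)]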
Example No. 3
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    group.add_argument('--adv_bias', default=8000, type=int, metavar='N',
                       help='threshold for rare-pop words')
    group.add_argument('--adv_lambda', default=0.0, type=float, metavar='D',
                       help='weight of adversarial loss')
    group.add_argument('--adv_lr', default=0.01, type=float, metavar='D',
                       help='learning rate for the adversarial component')
    group.add_argument('--adv_wdecay', default=0.0, type=float, metavar='D',
                       help='weight decay for the adversarial component')
    group.add_argument('--adv_updates', default=1, type=int, metavar='N',
                       help='number of adversarial updates per step')
    return group
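
A side note on attribute names in this variant: argparse derives each dest from the first long option with dashes turned into underscores, so the dash-style flags and the underscore-style adversarial flags end up as consistent Python attributes. A quick check, assuming the registry placeholders from the first sketch.

# Hypothetical check of the generated attribute names; the adversarial flags
# keep their underscores verbatim, the dash-style flags are converted.
import argparse

parser = argparse.ArgumentParser()
add_optimization_args(parser)
args = parser.parse_args(['--adv_lambda', '0.5', '--lr-shrink', '0.5'])
print(args.adv_lambda, args.lr_shrink, args.adv_bias)   # 0.5 0.5 8000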
Example No. 4
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    # fmt: off
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N1,N2,...,N_K',
                       type=lambda uf: eval_str_list(uf, type=int),
                       help='update parameters every N_i batches, when in epoch i')
    group.add_argument('--ema-decay', default=0.9999, type=float, metavar='D',
                       help='exponential moving average decay')
    group.add_argument('--no-ema', action='store_true',
                       help='disable exponential moving average')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', type=eval_str_list,
                       metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    # fmt: on
    return group
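
The --ema-decay and --no-ema flags in this variant configure a parameter-averaging step that is not part of the snippet. A schematic of the usual exponential-moving-average update they describe, shown on plain floats; this is an assumption about the surrounding trainer, not its actual EMA class.

# shadow <- decay * shadow + (1 - decay) * param, applied after each optimizer
# step unless --no-ema is set. Hypothetical helper.
def ema_update(shadow, params, decay=0.9999):
    for name, value in params.items():
        shadow[name] = decay * shadow[name] + (1.0 - decay) * value
    return shadow

shadow = {'w': 1.0}
shadow = ema_update(shadow, {'w': 0.0}, decay=0.9999)
print(shadow['w'])   # 0.9999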
Example No. 5
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')

    return group
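
For context on --update-freq (a plain string here, a parsed int list in other variants): it sets how many consecutive batches contribute gradients to a single optimizer step in epoch i. A schematic accumulation loop, not fairseq's Trainer.

# Hypothetical loop: with update_freq = [4, 2, 1], epoch 1 steps every 4
# batches, epoch 2 every 2, and later epochs reuse the last entry.
def epoch_update_freq(update_freq, epoch):
    return update_freq[min(epoch - 1, len(update_freq) - 1)]

def train_one_epoch(batches, epoch, update_freq, backward, step):
    freq = epoch_update_freq(update_freq, epoch)
    for i, batch in enumerate(batches, start=1):
        backward(batch)          # accumulate gradients
        if i % freq == 0:
            step()               # optimizer step + zero gradients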
Example No. 6
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch',
                       '--me',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm',
                       default=25,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--sentence-avg',
        action='store_true',
        help='normalize gradients by the number of sentences in a batch'
        ' (default is to normalize by number of tokens)')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer',
                       default='nag',
                       metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(
                           OPTIMIZER_REGISTRY.keys())))
    group.add_argument(
        '--lr',
        '--learning-rate',
        default='0.25',
        metavar='LR_1,LR_2,...,LR_N',
        help='learning rate for the first N epochs; all epochs >N using LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument('--momentum',
                       default=0.99,
                       type=float,
                       metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay',
                       '--wd',
                       default=0.0,
                       type=float,
                       metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument(
        '--lr-scheduler',
        default='reduce_lr_on_plateau',
        help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.
        format(', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument(
        '--lr-shrink',
        default=0.1,
        type=float,
        metavar='LS',
        help=
        'learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr',
                       default=1e-5,
                       type=float,
                       metavar='LR',
                       help='minimum learning rate')

    group.add_argument(
        '--sample-without-replacement',
        default=0,
        type=int,
        metavar='N',
        help='If bigger than 0, use that number of mini-batches for each epoch,'
        ' where each sample is drawn randomly without replacement from the'
        ' dataset')
    group.add_argument('--curriculum',
                       default=0,
                       type=int,
                       metavar='N',
                       help='sort batches by source length for first N epochs')
    return group
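
Per its help string, --curriculum sorts batches by source length for the first N epochs, and --sample-without-replacement caps the number of mini-batches drawn per epoch. A schematic of the curriculum part; this is a hypothetical helper, not the snippet's data loader.

import random

# For epoch <= curriculum, order batches by source length; afterwards shuffle.
def order_batches(batches, epoch, curriculum, seed=1):
    if curriculum > 0 and epoch <= curriculum:
        return sorted(batches, key=lambda b: b['src_len'])
    shuffled = list(batches)
    random.Random(seed + epoch).shuffle(shuffled)
    return shuffled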
Example No. 7
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch',
                       '--me',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm',
                       default=25,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--sentence-avg',
        action='store_true',
        help='normalize gradients by the number of sentences in a batch'
        ' (default is to normalize by number of tokens)')
    group.add_argument(
        '--update-freq',
        default='1',
        metavar='N',
        help='update parameters every N_i batches, when in epoch i')
    group.add_argument('--assistant',
                       action='store_true',
                       help='whether use assistant as batch sampler')
    group.add_argument(
        '--batch_method',
        type=str,
        default='sentences',
        help='the method to distribute instances [sentences, bins]')
    group.add_argument('--use-tfidf',
                       action='store_true',
                       help='whether use tf-idf version of assistant')
    group.add_argument('--spl', action='store_true', help='whether use spl')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer',
                       default='nag',
                       metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument(
        '--lr',
        '--learning-rate',
        default='0.25',
        metavar='LR_1,LR_2,...,LR_N',
        help='learning rate for the first N epochs; all epochs >N using LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument('--momentum',
                       default=0.99,
                       type=float,
                       metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay',
                       '--wd',
                       default=0.0,
                       type=float,
                       metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler',
                       default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument(
        '--lr-shrink',
        default=0.1,
        type=float,
        metavar='LS',
        help=
        'learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr',
                       default=1e-5,
                       type=float,
                       metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale',
                       default=1e-4,
                       type=float,
                       metavar='D',
                       help='minimum loss scale (for FP16 training)')

    return group
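
The --lr-shrink help above spells out the annealing rule lr_new = lr * lr_shrink; with the default reduce_lr_on_plateau scheduler it applies when validation loss stops improving, and --min-lr puts a floor under the result. A schematic of that rule, not the fairseq scheduler itself.

# Hypothetical annealing step driven by validation loss.
def maybe_shrink_lr(lr, best_val_loss, val_loss, lr_shrink=0.1, min_lr=1e-5):
    if val_loss >= best_val_loss:            # no improvement this validation
        lr = max(lr * lr_shrink, min_lr)
    return lr

print(maybe_shrink_lr(0.25, best_val_loss=2.0, val_loss=2.1))   # 0.025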
Example No. 8
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch',
                       '--me',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu',
                       default=0.0,
                       type=float,
                       metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm',
                       default=25,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--update-freq',
        default=[1],
        nargs='+',
        type=int,
        help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer',
                       default='nag',
                       metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(
                           OPTIMIZER_REGISTRY.keys())))
    group.add_argument(
        '--lr',
        '--learning-rate',
        default=[0.25],
        nargs='+',
        type=float,
        help='learning rate for the first N epochs; all epochs >N using LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument('--momentum',
                       default=0.99,
                       type=float,
                       metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay',
                       '--wd',
                       default=0.0,
                       type=float,
                       metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument(
        '--lr-scheduler',
        default='reduce_lr_on_plateau',
        help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.
        format(', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument(
        '--lr-shrink',
        default=0.1,
        type=float,
        metavar='LS',
        help=
        'learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr',
                       default=1e-5,
                       type=float,
                       metavar='LR',
                       help='minimum learning rate')

    # Criterion args
    parser.add_argument(
        '--label-smoothing',
        default=0.,
        type=float,
        metavar='D',
        help='epsilon for label smoothing, 0 means no label smoothing')

    return group
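
Unlike the string-typed variants, this one declares --lr and --update-freq with nargs='+' and numeric types, so values are passed space-separated on the command line and arrive as Python lists. A quick check, assuming the same registry placeholders as in the first sketch.

import argparse

parser = argparse.ArgumentParser('train')
add_optimization_args(parser)
args = parser.parse_args(['--lr', '0.25', '0.1', '--update-freq', '4', '2'])
print(args.lr)           # [0.25, 0.1]
print(args.update_freq)  # [4, 2]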
Example No. 9
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch',
                       '--me',
                       default=-1,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu',
                       default=0.0,
                       type=float,
                       metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm',
                       default=25,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--sentence-avg',
        action='store_true',
        help='normalize gradients by the number of sentences in a batch'
        ' (default is to normalize by number of tokens)')
    group.add_argument(
        '--update-freq',
        default='1',
        metavar='N',
        help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer',
                       default='nag',
                       metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(
                           OPTIMIZER_REGISTRY.keys())))
    group.add_argument(
        '--lr',
        '--learning-rate',
        default='0.25',
        metavar='LR_1,LR_2,...,LR_N',
        help='learning rate for the first N epochs; all epochs >N using LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument('--momentum',
                       default=0.99,
                       type=float,
                       metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay',
                       '--wd',
                       default=0.0,
                       type=float,
                       metavar='WD',
                       help='weight decay')

    # Distributed weight update parameters
    group.add_argument('--distributed-weight-update',
                       '--dwu',
                       default=0,
                       type=int,
                       metavar='DWU',
                       help='select distributed weight update strategy')
    group.add_argument(
        '--dwu-group-size',
        '--dwugs',
        default=0,
        type=int,
        metavar='DWUGS',
        help=
        'distributed weight update group size. If arg is 0, defaults to one node'
    )
    group.add_argument('--dwu-num-blocks',
                       '--dwunb',
                       default=8,
                       type=int,
                       metavar='DWUNB',
                       help='number of blocks in dwu scheme')
    group.add_argument('--dwu-num-chunks',
                       '--dwunc',
                       default=8,
                       type=int,
                       metavar='DWUNC',
                       help='number of chunks in dwu scheme')
    group.add_argument(
        '--dwu-num-rs-pg',
        '--dwurspg',
        default=2,
        type=int,
        metavar='DWURSPG',
        help='number of reduction-scatter streams in dwu scheme')
    group.add_argument('--dwu-num-ar-pg',
                       '--dwuarpg',
                       default=4,
                       type=int,
                       metavar='DWUARPG',
                       help='number of all-reduce streams in dwu scheme')
    group.add_argument('--dwu-num-ag-pg',
                       '--dwuagpg',
                       default=2,
                       type=int,
                       metavar='DWUAGPG',
                       help='number of all-gather streams in dwu scheme')

    group.add_argument('--dwu-full-pipeline',
                       action='store_true',
                       help='whether to do full or partial pipeline')
    group.add_argument('--dwu-overlap-reductions',
                       action='store_true',
                       help='whether to overlap reductions with backprop')
    group.add_argument('--dwu-compute-L2-grad-norm',
                       action='store_true',
                       help='whether to compute L2 grad norm')
    group.add_argument(
        '--dwu-flat-mt',
        action='store_true',
        help='whether to flatten gradients with multi tensor scale')
    group.add_argument('--dwu-e5m2-allgather',
                       action='store_true',
                       help='do allgather with e5m2 floats')
    group.add_argument(
        '--dwu-do-not-flatten-model',
        action='store_true',
        help='whether it is allowed to flatten model parameters')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument(
        '--lr-scheduler',
        default='reduce_lr_on_plateau',
        help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.
        format(', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument(
        '--lr-shrink',
        default=0.1,
        type=float,
        metavar='LS',
        help=
        'learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr',
                       default=1e-5,
                       type=float,
                       metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale',
                       default=1e-4,
                       type=float,
                       metavar='D',
                       help='minimum loss scale (for FP16 training)')

    # Parallel backward + all-reduce optimization
    group.add_argument(
        '--enable-parallel-backward-allred-opt',
        action='store_true',
        help=
        'enable all-reduce of w-gradients in parallel with backward propagation (only for FP16 training)'
    )
    group.add_argument('--parallel-backward-allred-cuda-nstreams',
                       type=int,
                       default=1,
                       metavar='N',
                       help='num of CUDA streams used for parallel all-reduce')
    group.add_argument(
        '--parallel-backward-allred-opt-threshold',
        type=int,
        default=0,
        metavar='N',
        help=
        'min num of contiguous gradient elements before all-reduce is triggered'
    )
    group.add_argument(
        '--enable-parallel-backward-allred-opt-correctness-check',
        action='store_true',
        help=
        'compare w-gradient values obtained doing all-reduce in parallel vs. at the end'
    )
    group.add_argument('--dataloader-num-workers',
                       type=int,
                       default=1,
                       metavar='N',
                       help='num subprocesses for train data loader')
    group.add_argument('--enable-dataloader-pin-memory',
                       action='store_true',
                       help='enable pin_memory for train data loader')

    return group
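
--min-loss-scale only matters for FP16 training: a dynamic loss scaler typically halves its scale after a gradient overflow, and training aborts rather than continuing once the scale would fall below this floor. A schematic scaler illustrating that role; it is an assumption, not fairseq's or apex's implementation.

# Hypothetical dynamic loss scaler showing the --min-loss-scale floor.
class DynamicLossScaler:
    def __init__(self, init_scale=2.0 ** 15, min_scale=1e-4):
        self.scale = init_scale
        self.min_scale = min_scale

    def backoff(self):
        # called when an overflow (inf/nan gradient) is detected
        self.scale /= 2.0
        if self.scale < self.min_scale:
            raise FloatingPointError('loss scale fell below --min-loss-scale; '
                                     'loss is probably exploding')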
Example No. 10
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch',
                       '--me',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update',
                       '--mu',
                       default=0,
                       type=int,
                       metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm',
                       default=25,
                       type=float,
                       metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument(
        '--sentence-avg',
        action='store_true',
        help='normalize gradients by the number of sentences in a batch'
        ' (default is to normalize by number of tokens)')
    group.add_argument(
        '--update-freq',
        default='1',
        metavar='N',
        help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer',
                       default='nag',
                       metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(
                           OPTIMIZER_REGISTRY.keys())))
    group.add_argument(
        '--lr',
        '--learning-rate',
        default='0.25',
        metavar='LR_1,LR_2,...,LR_N',
        help='learning rate for the first N epochs; all epochs >N using LR_N'
        ' (note: this may be interpreted differently depending on --lr-scheduler)'
    )
    group.add_argument('--momentum',
                       default=0.99,
                       type=float,
                       metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay',
                       '--wd',
                       default=0.0,
                       type=float,
                       metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument(
        '--lr-scheduler',
        default='reduce_lr_on_plateau',
        help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.
        format(', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument(
        '--lr-shrink',
        default=0.1,
        type=float,
        metavar='LS',
        help=
        'learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr',
                       default=1e-5,
                       type=float,
                       metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale',
                       default=1e-4,
                       type=float,
                       metavar='D',
                       help='minimum loss scale (for FP16 training)')
    # For rl training specification
    group.add_argument(
        '--multinomial-sample-train',
        default=False,
        action='store_true',
        help=
        'use multinomial sampling instead of beam search for rl training sample'
    )
    group.add_argument(
        '--delta-reward',
        default=False,
        action='store_true',
        help='use delta reward instead of total reward, default is total reward'
    )
    group.add_argument(
        '--mle-weight',
        default=0.0,
        type=float,
        help='use mle loss combined with rl loss, weight of mle loss.')
    group.add_argument(
        '--rl-weight',
        default=1.0,
        type=float,
        help='use mle loss combined with rl loss, weight of rl loss.')
    group.add_argument('--max-order',
                       default=4,
                       type=int,
                       help='max order for gleu')
    group.add_argument(
        '--sample-beam',
        default=5,
        type=int,
        help=
        'number of translation sentences generated by one source sentence in v2 loss'
    )
    group.add_argument(
        '--gram',
        default=0,
        type=int,
        help="if not 0, only count grams with length 'gram' in GLEU computation"
    )
    group.add_argument('--modgleu',
                       default=False,
                       action='store_true',
                       help='use modified version of GLEU computation')
    return group
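
The --mle-weight and --rl-weight flags describe a combined objective: the RL criterion is mixed with a maximum-likelihood term. A one-line schematic of that combination as the help strings state it; the actual criterion is not part of the snippet.

# total = mle_weight * mle_loss + rl_weight * rl_loss
def combined_loss(mle_loss, rl_loss, mle_weight=0.0, rl_weight=1.0):
    return mle_weight * mle_loss + rl_weight * rl_loss

print(combined_loss(3.2, 0.8, mle_weight=0.5, rl_weight=1.0))   # 2.4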