Esempio n. 1
0
 def add_cmdline_args(argparser):
     """Add command-line arguments specifically for this agent."""
     DictionaryAgent.add_cmdline_args(argparser)
     agent = argparser.add_argument_group('Fairseq Arguments')
     agent.add_argument(
         '-tr', '--truncate',
         type=int, default=-1,
         help='truncate input & output lengths to speed up training (may '
              'reduce accuracy). This fixes all input and output to have a '
              'maximum length. This reduces the total amount of padding in '
              'the batches.')
     agent.add_argument(
         '--max-positions',
         default=1024,
         type=int,
         metavar='N',
         help='max number of tokens in the sequence')
     agent.add_argument(
         '--seed',
         default=1,
         type=int,
         metavar='N',
         help='pseudo random number generator seed')
     options.add_optimization_args(argparser)
     options.add_generation_args(argparser)
     options.add_model_args(argparser)
Esempio n. 2
0
def get_training_and_generation_parser(default_task='translation'):
    parser = options.get_parser('Trainer', default_task)
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_generation_args(parser)
    options.add_distributed_training_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    return parser
Esempio n. 3
0
def get_parser_with_args():
    parser = options.get_parser("Trainer")
    parser.add_argument(
        "--log-verbose",
        action="store_true",
        help="Whether to output more verbose logs for debugging/profiling.",
    )
    pytorch_translate_options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    # Adds args related to training (validation and stopping criterions).
    optimization_group = options.add_optimization_args(parser)
    pytorch_translate_options.expand_optimization_args(optimization_group)
    # Adds args related to checkpointing.
    checkointing_group = options.add_checkpoint_args(parser)
    pytorch_translate_options.expand_checkpointing_args(checkointing_group)
    # Add model related args
    options.add_model_args(parser)
    # Adds args for generating intermediate BLEU eval while training.
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group,
                                                     train=True)
    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    pytorch_translate_options.add_preprocessing_args(parser)
    return parser
Esempio n. 4
0
 def add_cmdline_args(argparser):
     """Add command-line arguments specifically for this agent."""
     DictionaryAgent.add_cmdline_args(argparser)
     agent = argparser.add_argument_group('Fairseq Arguments')
     agent.add_argument('--max-positions',
                        default=1024,
                        type=int,
                        metavar='N',
                        help='max number of tokens in the sequence')
     agent.add_argument('--seed',
                        default=1,
                        type=int,
                        metavar='N',
                        help='pseudo random number generator seed')
     options.add_optimization_args(argparser)
     options.add_generation_args(argparser)
     options.add_model_args(argparser)
Esempio n. 5
0
def get_parser_with_args(default_task="pytorch_translate"):
    parser = options.get_parser("Trainer", default_task=default_task)
    pytorch_translate_options.add_verbosity_args(parser, train=True)
    pytorch_translate_options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    # Adds args related to training (validation and stopping criterions).
    optimization_group = options.add_optimization_args(parser)
    pytorch_translate_options.expand_optimization_args(optimization_group)
    # Adds args related to checkpointing.
    checkpointing_group = options.add_checkpoint_args(parser)
    pytorch_translate_options.expand_checkpointing_args(checkpointing_group)
    # Add model related args
    options.add_model_args(parser)
    # Adds args for generating intermediate BLEU eval while training.
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group, train=True)
    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    pytorch_translate_options.add_preprocessing_args(parser)
    return parser
Esempio n. 6
0
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens',
                              default=0,
                              type=int,
                              metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--batch-size',
                              default=32,
                              type=int,
                              metavar='N',
                              help='batch size')
    dataset_args.add_argument('--test-batch-size',
                              default=32,
                              type=int,
                              metavar='N',
                              help='batch size for test set')
    dataset_args.add_argument('--valid-batch-size',
                              default=32,
                              type=int,
                              metavar='N',
                              help='batch size for validation set')
    dataset_args.add_argument(
        '--train-subset',
        default='train',
        metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset',
        default='valid',
        metavar='SPLIT',
        help='comma separated list ofdata subsets '
        ' to use for validation (train, valid, valid1,test, test1)')
    dataset_args.add_argument('--test-subset',
                              default='test',
                              metavar='SPLIT',
                              help='comma separated list ofdata subset '
                              'to use for testing (train, valid, test)')
    dataset_args.add_argument(
        '--valid-script',
        nargs='+',
        metavar='PATH',
        help='path to external validation script (optional).')

    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Setting args.max_tokens to infinity(same as setting to None)
    if args.max_tokens == 0:
        args.max_tokens = None

    # Load dataset
    dataset = data.load_with_check(args.data, args.source_lang,
                                   args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in dataset.splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model
    print('| model {}'.format(args.arch))
    model = utils.build_model(args, dataset)
    criterion = utils.build_criterion(args, dataset)

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model)

    # Load the latest checkpoint if one is available
    epoch, batch_offset = trainer.load_checkpoint(
        os.path.join(args.save_dir, args.restore_file))

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, criterion, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, criterion, dataset,
                                subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    trainer.save_checkpoint(
                        args,
                        epoch,
                        0,
                        val_loss,
                        validation_script=args.valid_script)

                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Generate on test set and compute BLEU score
    for beam in [1, 5, 10, 20]:
        for subset in args.test_subset.split(','):
            scorer = score_test(args,
                                trainer.get_model(),
                                dataset,
                                subset,
                                beam,
                                cuda_device=(0 if num_gpus > 0 else None))
            print('| Test on {} with beam={}: {}'.format(
                subset, beam, scorer.result_string()))

    # Stop multiprocessing
    trainer.stop()
Esempio n. 7
0
def get_parser_with_args():
    parser = options.get_parser("Trainer")
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    options.add_generation_args(parser)

    parser.add_argument(
        "--log-verbose",
        action="store_true",
        help="Whether to output more verbose logs for debugging/profiling.",
    )

    # Adds args related to training (validation and stopping criterions).
    group = parser.add_argument_group("Optimization")
    group.add_argument(
        "--subepoch-validate-interval",
        default=0,
        type=int,
        metavar="N",
        help="Calculates loss over the validation set every N batch updates. "
        "Note that validation is done at the end of every epoch regardless. "
        "A value of <= 0 disables this.",
    )
    group.add_argument(
        "--stop-time-hr",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N hours have elapsed. "
        "A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-validate-loss",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N validations have been run without "
        "achieving a better loss than before. Note that this is affected by "
        "--validation-interval in how frequently we run validation in the "
        "first place. A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-bleu-eval",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N evals have been run without "
        "achieving a better BLEU score than before. Note that this is affected "
        "by --generate-bleu-eval-interval in how frequently we run BLEU eval "
        "in the first place. A value of < 0 disables this.",
    )

    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    preprocess.add_args(parser)

    # Adds args related to checkpointing.
    group = parser.add_argument_group("Checkpointing")
    group.add_argument(
        "--no-end-of-epoch-checkpoints",
        action="store_true",
        help="Disables saving checkpoints at the end of the epoch. "
        "This differs from --no-save and --no-epoch-checkpoints in that it "
        "still allows for intra-epoch checkpoints if --save-interval is set.",
    )
    group.add_argument(
        "--max-checkpoints-kept",
        default=-1,
        type=int,
        metavar="N",
        help="Keep at most the last N checkpoints file around. "
        "A value < -1 keeps all. "
        "When --generate-bleu-eval-avg-checkpoints is used and is > N, the "
        "number of checkpoints kept around is automatically adjusted "
        "to allow BLEU to work properly.",
    )

    # Adds args for generating intermediate BLEU eval while training.
    # generate.add_args() adds args used by both train.py and the standalone
    # generate binary, while the flags defined here are used only by train.py.
    generate.add_args(parser)
    group = parser.add_argument_group("Generation")
    group.add_argument(
        "--generate-bleu-eval-per-epoch",
        action="store_true",
        help="Whether to generate BLEU score eval after each epoch.",
    )
    group.add_argument(
        "--generate-bleu-eval-interval",
        default=0,
        type=int,
        metavar="N",
        help="Does BLEU eval every N batch updates. Note that "
        "--save-interval also affects this - we can only eval as "
        "frequently as a checkpoint is written. A value of <= 0 "
        "disables this.",
    )
    group.add_argument(
        "--generate-bleu-eval-avg-checkpoints",
        default=1,
        type=int,
        metavar="N",
        help="Maximum number of last N checkpoints to average over when "
        "doing BLEU eval. Must be >= 1.",
    )
    group.add_argument(
        "--continuous-averaging-after-epochs",
        type=int,
        default=-1,
        help=("Average parameter values after each step since previous "
              "checkpoint, beginning after the specified number of epochs. "),
    )

    return parser
Esempio n. 8
0
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        # first we need to add the general torch agent operations
        TorchAgent.add_cmdline_args(argparser)

        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument(
            '--seed',
            default=1,
            type=int,
            metavar='N',
            help='pseudo random number generator seed'
        )
        agent.add_argument(
            '--skip-generation',
            default=False,
            type=bool,
            metavar='BOOL',
            help='Skips test time beam search. Much faster if you only need PPL',
        )

        # Dictionary construction stuff. Using the subclass in case we end up
        # needing any fairseq specific things
        _FairseqDictionary.add_cmdline_args(argparser)

        # Optimization and learning rate schedule specific arguments
        options.add_optimization_args(argparser)
        known_args = argparser.parse_known_args(nohelp=True)[0]
        if hasattr(known_args, "optimizer"):
            optimizer = known_args.optimizer
            opt_group = argparser.add_argument_group(
                '{} optimizer arguments'.format(optimizer)
            )
            optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group)
        if hasattr(known_args, "lr_scheduler"):
            lr_scheduler = known_args.lr_scheduler
            lr_group = argparser.add_argument_group(
                '{} scheduler arguments'.format(lr_scheduler)
            )
            optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args(lr_group)

        # Generation arguments
        options.add_generation_args(argparser)

        # We need to find out the fairseq model-specific options, so grab the
        # architecture stuff and look up its options
        arch_group = options.add_model_args(argparser)
        # Fairseq marks the arch flag as required, but it may be specified
        # by a saved model cache, so we do some weird stuff to undo that
        for a in arch_group._actions:
            if a.dest == "arch":
                a.required = False
                a.default = None
                break
        known_args = argparser.parse_known_args(nohelp=True)[0]
        if hasattr(known_args, "arch") and known_args.arch is not None:
            arch = known_args.arch
            arch_group = argparser.add_argument_group(
                "{} architecture arguments".format(arch)
            )
            models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group)

        # Override a few defaults from within fairseq to more sensible defaults
        argparser.set_defaults(
            clip_norm=0.1,
            adam_betas="(0.9,0.98)"
        )
Esempio n. 9
0
def get_parser_with_args():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    options.add_generation_args(parser)

    parser.add_argument(
        '--log-verbose',
        action='store_true',
        help='Whether to output more verbose logs for debugging/profiling.',
    )

    # Adds args related to training (validation and stopping criterions).
    group = parser.add_argument_group('Optimization')
    group.add_argument(
        '--subepoch-validate-interval',
        default=0,
        type=int,
        metavar='N',
        help='Calculates loss over the validation set every N batch updates. '
        'Note that validation is done at the end of every epoch regardless. '
        'A value of <= 0 disables this.',
    )
    group.add_argument(
        '--stop-time-hr',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N hours have elapsed. '
        'A value of < 0 disables this.',
    )
    group.add_argument(
        '--stop-no-best-validate-loss',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N validations have been run without '
        'achieving a better loss than before. Note that this is affected by '
        '--validation-interval in how frequently we run validation in the '
        'first place. A value of < 0 disables this.',
    )
    group.add_argument(
        '--stop-no-best-bleu-eval',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N evals have been run without '
        'achieving a better BLEU score than before. Note that this is affected '
        'by --generate-bleu-eval-interval in how frequently we run BLEU eval '
        'in the first place. A value of < 0 disables this.',
    )

    # Args related to dataset.
    group = parser.add_argument_group('Dataset and data loading')
    group.add_argument(
        '--source-vocab-file',
        default='',
        metavar='FILE',
        help='Path to text file representing the fairseq Dictionary to use. '
        'If left empty, the dict is auto-generated from source training data.',
    )
    group.add_argument(
        '--source-max-vocab-size',
        default=-1,
        type=int,
        metavar='N',
        help='If a new vocab file needs to be generated, restrict it to the '
        'top N most common words. If we re-use an existing vocab file, this '
        'flag will have no effect. A value of < 0 means no max size.',
    )
    group.add_argument(
        '--target-vocab-file',
        default='',
        metavar='FILE',
        help='Path to text file representing the fairseq Dictionary to use. '
        'If left empty, the dict is auto-generated from target training data.',
    )
    group.add_argument(
        '--target-max-vocab-size',
        default=-1,
        type=int,
        metavar='N',
        help='If a new vocab file needs to be generated, restrict it to the '
        'top N most common words. If we re-use an existing vocab file, this '
        'flag will have no effect. A value of < 0 means no max size.',
    )
    group.add_argument(
        '--train-source-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing source training examples. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--train-target-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing target training examples. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--eval-source-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing source eval examples for '
        'calculating validation loss and BLEU eval scores. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--eval-target-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing target eval examples for '
        'calculating validation loss and BLEU eval scores. '
        'This overrides what would be loaded from the data dir.',
    )

    # Adds args related to checkpointing.
    group = parser.add_argument_group('Checkpointing')
    group.add_argument(
        '--no-end-of-epoch-checkpoints',
        action='store_true',
        help='Disables saving checkpoints at the end of the epoch. '
        'This differs from --no-save and --no-epoch-checkpoints in that it '
        'still allows for intra-epoch checkpoints if --save-interval is set.')

    # Adds args for generating intermediate BLEU eval while training.
    # generate.add_args() adds args used by both train.py and the standalone
    # generate binary, while the flags defined here are used only by train.py.
    generate.add_args(parser)
    group = parser.add_argument_group('Generation')
    group.add_argument(
        '--generate-bleu-eval-per-epoch',
        action='store_true',
        help='Whether to generate BLEU score eval after each epoch.',
    )
    group.add_argument(
        '--generate-bleu-eval-interval',
        default=0,
        type=int,
        metavar='N',
        help='Does BLEU eval every N batch updates. Note that '
        '--save-interval also affects this - we can only eval as '
        'frequently as a checkpoint is written. A value of <= 0 '
        'disables this.',
    )
    group.add_argument(
        '--generate-bleu-eval-avg-checkpoints',
        default=1,
        type=int,
        metavar='N',
        help='Maximum number of last N checkpoints to average over when '
        'doing BLEU eval. Must be >= 1.',
    )
    group.add_argument(
        '--continuous-averaging-after-epochs',
        type=int,
        default=-1,
        help=('Average parameter values after each step since previous '
              'checkpoint, beginning after the specified number of epochs. '),
    )

    return parser
Esempio n. 10
0
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens',
                              default=6000,
                              type=int,
                              metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument(
        '--train-subset',
        default='train',
        metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset',
        default='valid',
        metavar='SPLIT',
        help='comma separated list ofdata subsets '
        ' to use for validation (train, valid, valid1,test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    dataset = data.load_with_check(args.data, ['train', 'valid'],
                                   args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in ['train', 'valid']:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, subset,
                                num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
Esempio n. 11
0
    def add_cmdline_args(cls, argparser):
        """Add command-line arguments specifically for this agent."""
        # first we need to add the general torch agent operations
        TorchAgent.add_cmdline_args(argparser)

        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument('--fp16',
                           default=False,
                           type=bool,
                           help='Use fp16 training')
        agent.add_argument('--seed',
                           default=1,
                           type=int,
                           metavar='N',
                           help='pseudo random number generator seed')
        agent.add_argument(
            '--skip-generation',
            default=False,
            type=bool,
            metavar='BOOL',
            help=
            'Skips test time beam search. Much faster if you only need PPL',
        )

        # Dictionary construction stuff. Using the subclass in case we end up
        # needing any fairseq specific things
        cls.dictionary_class().add_cmdline_args(argparser)

        # Check subargs for generation, optimizers, criterions, archs, etc
        options.add_generation_args(argparser)
        options.add_optimization_args(argparser)

        # make sure we set defaults according to the model before parsing
        argparser.set_defaults(**cls.DEFAULT_OPTIONS)
        known_args = argparser.parse_known_args(nohelp=True)[0]

        if hasattr(known_args, "optimizer"):
            optimizer = known_args.optimizer
            opt_group = argparser.add_argument_group(
                '{} optimizer arguments'.format(optimizer))
            optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group)
        if hasattr(known_args, "lr_scheduler"):
            lr_scheduler = known_args.lr_scheduler
            lr_group = argparser.add_argument_group(
                '{} scheduler arguments'.format(lr_scheduler))
            optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args(
                lr_group)
        # We need to find out the fairseq model-specific options, so grab the
        # architecture stuff and look up its options
        arch_group = options.add_model_args(argparser)
        # Fairseq marks the arch flag as required, but it may be specified
        # by a saved model cache, so we do some weird stuff to undo that
        for a in arch_group._actions:
            if a.dest == "arch":
                a.required = False
                a.default = None
                break

        # make sure we set defaults according to parlai model before parsing
        argparser.set_defaults(**cls.DEFAULT_OPTIONS)
        known_args = argparser.parse_known_args(nohelp=True)[0]

        if hasattr(known_args, "arch") and known_args.arch is not None:
            arch = known_args.arch
            arch_group = argparser.add_argument_group(
                "{} architecture arguments".format(arch))
            models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group)

        if hasattr(known_args, "criterion"):
            crit_group = argparser.add_argument_group(
                '{} criterion arguments'.format(known_args.criterion))
            criterions.CRITERION_REGISTRY[known_args.criterion].add_args(
                crit_group)

        # As one final check, let's make sure we set defaults correctly
        argparser.set_defaults(**cls.DEFAULT_OPTIONS)
Esempio n. 12
0
    def add_cmdline_args(cls, argparser):
        """Add command-line arguments specifically for this agent."""
        # first we need to add the general torch agent operations
        super(FairseqAgent, cls).add_cmdline_args(argparser)

        # let's store any defaults that were overridden
        old_defaults = argparser._defaults
        if 'clip_norm' not in old_defaults:
            # fairseq has a few awful defaults
            old_defaults['clip_norm'] = 1.0
        if 'optimizer' not in old_defaults:
            old_defaults['optimizer'] = 'adam'
            old_defaults['adam_betas'] = '(0.9,0.98)'

        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument('--fp16',
                           default=False,
                           type='bool',
                           help='Use fp16 training')
        agent.add_argument(
            '--fp16-init-scale',
            default=2**7,
            type=int,
            help='default FP16 loss scale',
        )
        agent.add_argument(
            '--seed',
            default=1,
            type=int,
            metavar='N',
            help='pseudo random number generator seed',
        )
        agent.add_argument(
            '--skip-generation',
            default=False,
            type='bool',
            metavar='BOOL',
            help=
            'Skips test time beam search. Much faster if you only need PPL',
        )

        # Check subargs for generation, optimizers, criterions, archs, etc
        options.add_generation_args(argparser)
        options.add_optimization_args(argparser)
        options.add_checkpoint_args(argparser)

        # restore any user set defaults that fairseq possibly overrode
        argparser.set_defaults(**old_defaults)
        known_args = argparser.parse_known_args(nohelp=True)[0]

        if hasattr(known_args, "optimizer"):
            optimizer = known_args.optimizer
            opt_group = argparser.add_argument_group(
                '{} optimizer arguments'.format(optimizer))
            optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group)
        if hasattr(known_args, "lr_scheduler"):
            lr_scheduler = known_args.lr_scheduler
            lr_group = argparser.add_argument_group(
                '{} scheduler arguments'.format(lr_scheduler))
            optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args(
                lr_group)
        # We need to find out the fairseq model-specific options, so grab the
        # architecture stuff and look up its options
        arch_group = options.add_model_args(argparser)
        # Fairseq marks the arch flag as required, but it may be specified
        # by a saved model cache, so we do some weird stuff to undo that
        for a in arch_group._actions:
            if a.dest == "arch":
                a.required = False
                a.default = None
                break

        # once again restore any user-set defaults
        argparser.set_defaults(**old_defaults)
        known_args = argparser.parse_known_args(nohelp=True)[0]

        if hasattr(known_args, "arch") and known_args.arch is not None:
            arch = known_args.arch
            arch_group = argparser.add_argument_group(
                "{} architecture arguments".format(arch))
            models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group)

        if hasattr(known_args, "criterion"):
            crit_group = argparser.add_argument_group(
                '{} criterion arguments'.format(known_args.criterion))
            criterions.CRITERION_REGISTRY[known_args.criterion].add_args(
                crit_group)

        # one last time, restore any user set defaults
        argparser.set_defaults(**old_defaults)
    def from_checkpoint(self,
                        checkpoint,
                        roberta_cache_path=None,
                        inspector=None):
        '''
        Initialize model from checkpoint
        '''

        # load fairseq task
        parser = options.get_interactive_generation_parser()
        options.add_optimization_args(parser)
        args = options.parse_args_and_arch(parser, input_args=['--data dummy'])

        # Read extra arguments
        model_folder = os.path.dirname(checkpoint.split(':')[0])
        # config with fairseq-preprocess and fairseq-train args
        config_json = f'{model_folder}/config.json'
        assert os.path.isfile(config_json), \
            "Model trained with v0.3.0 or above?"
        with open(config_json) as fid:
            extra_args = json.loads(fid.read())
        prepro_args = extra_args['fairseq_preprocess_args']
        train_args = extra_args['fairseq_train_args']
        # extra args by hand
        args.source_lang = 'en'
        args.target_lang = 'actions'
        args.path = checkpoint
        args.roberta_cache_path = roberta_cache_path
        dim = train_args['--pretrained-embed-dim'][0]
        args.model_overrides = \
            "{'pretrained_embed_dim':%s, 'task': 'translation'}" % dim
        assert bool(args.left_pad_source), "Only left pad supported"

        # dictionaries
        src_dict_path = f'{model_folder}/dict.{args.source_lang}.txt'
        tgt_dict_path = f'{model_folder}/dict.{args.target_lang}.txt'
        assert os.path.isfile(src_dict_path), \
            f"Missing {src_dict_path}.\nModel trained with v0.3.0 or above?"\
            "\ncheck scripts/stack-transformer/update_model_to_v0.3.0.sh"
        assert os.path.isfile(tgt_dict_path), \
            f"Missing {tgt_dict_path}.\nModel trained with v0.3.0 or above?"\
            "\ncheck scripts/stack-transformer/update_model_to_v0.3.0.sh"
        src_dict = Dictionary.load(src_dict_path)
        tgt_dict = Dictionary.load(tgt_dict_path)

        use_cuda = torch.cuda.is_available() and not args.cpu

        # Override task to ensure compatibility with old models and overide
        # TODO: Task may not be even needed
        task = TranslationTask(args, src_dict, tgt_dict)
        model = load_models(args, task, use_cuda)

        # Load RoBERTa
        embeddings = PretrainedEmbeddings(
            name=prepro_args['--pretrained-embed'][0],
            bert_layers=[int(x) for x in prepro_args['--bert-layers']]
            if '--bert-layers' in prepro_args else None,
            model=load_roberta(name=prepro_args['--pretrained-embed'][0],
                               roberta_cache_path=args.roberta_cache_path,
                               roberta_use_gpu=use_cuda))

        print("Finished loading models")

        # State machine variables
        machine_rules = f'{model_folder}/train.rules.json'
        assert os.path.isfile(machine_rules), f"Missing {machine_rules}"
        machine_type = prepro_args['--machine-type'][0]

        return self(model,
                    machine_rules,
                    machine_type,
                    src_dict,
                    tgt_dict,
                    use_cuda,
                    embeddings=embeddings,
                    inspector=inspector)
Esempio n. 14
0
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens',
                              default=6000,
                              type=int,
                              metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences',
                              type=int,
                              metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument(
        '--train-subset',
        default='train',
        metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset',
        default='valid',
        metavar='SPLIT',
        help='comma separated list of data subsets '
        ' to use for validation (train, valid, valid1,test, test1)')
    dataset_args.add_argument(
        '--max-sentences-valid',
        type=int,
        metavar='N',
        help='maximum number of sentences in a validation batch')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)

    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if args.max_sentences_valid is None:
        args.max_sentences_valid = args.max_sentences

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang,
                                    args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    args.num_gpus = torch.cuda.device_count()

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    print(
        '| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'
        .format(args.num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (min(args.max_source_positions,
                               model.max_encoder_positions()),
                           min(args.max_target_positions,
                               model.max_decoder_positions()))
    max_positions_valid = (model.max_encoder_positions(),
                           model.max_decoder_positions())

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Create files to save losses
    traincsv_path = os.path.join(args.save_dir, 'train_losses.csv')
    validcsv_path = os.path.join(args.save_dir, 'valid_losses.csv')
    output_path = [traincsv_path, validcsv_path]
    for path in output_path:
        with open(path, 'w+') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',')
            csvwriter.writerow(['Epoch', 'Perplexity', 'Loss'])
            csvfile.close()

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train,
              traincsv_path)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset,
                                max_positions_valid, subset, validcsv_path)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
Esempio n. 15
0
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   ' to use for validation (train, valid, valid1,test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)

    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'.format(
        num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions, args.max_target_positions)
    max_positions_valid = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions())
    )

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, max_positions_valid, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
Esempio n. 16
0
def get_parser_with_args():
    parser = options.get_parser("Trainer")
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    options.add_generation_args(parser)

    parser.add_argument(
        "--log-verbose",
        action="store_true",
        help="Whether to output more verbose logs for debugging/profiling.",
    )

    # Adds args related to training (validation and stopping criterions).
    group = parser.add_argument_group("Optimization")
    group.add_argument(
        "--subepoch-validate-interval",
        default=0,
        type=int,
        metavar="N",
        help="Calculates loss over the validation set every N batch updates. "
        "Note that validation is done at the end of every epoch regardless. "
        "A value of <= 0 disables this.",
    )
    group.add_argument(
        "--stop-time-hr",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N hours have elapsed. "
        "A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-validate-loss",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N validations have been run without "
        "achieving a better loss than before. Note that this is affected by "
        "--validation-interval in how frequently we run validation in the "
        "first place. A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-bleu-eval",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N evals have been run without "
        "achieving a better BLEU score than before. Note that this is affected "
        "by --generate-bleu-eval-interval in how frequently we run BLEU eval "
        "in the first place. A value of < 0 disables this.",
    )

    # Args related to dataset.
    group = parser.add_argument_group("Dataset and data loading")
    group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the fairseq Dictionary to use. "
        "If left empty, the dict is auto-generated from source training data.",
    )
    group.add_argument(
        "--source-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the fairseq Dictionary to use. "
        "If left empty, the dict is auto-generated from target training data.",
    )
    group.add_argument(
        "--target-max-vocab-size",
        default=-1,
        type=int,
        metavar="N",
        help="If a new vocab file needs to be generated, restrict it to the "
        "top N most common words. If we re-use an existing vocab file, this "
        "flag will have no effect. A value of < 0 means no max size.",
    )
    group.add_argument(
        "--train-source-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing source training examples. "
        "This overrides what would be loaded from the data dir.",
    )
    group.add_argument(
        "--train-target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing target training examples. "
        "This overrides what would be loaded from the data dir.",
    )
    group.add_argument(
        "--eval-source-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing source eval examples for "
        "calculating validation loss and BLEU eval scores. "
        "This overrides what would be loaded from the data dir.",
    )
    group.add_argument(
        "--eval-target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing target eval examples for "
        "calculating validation loss and BLEU eval scores. "
        "This overrides what would be loaded from the data dir.",
    )
    group.add_argument(
        "--penalized-target-tokens-file",
        default="",
        metavar="FILE",
        help="Path to text file of tokens to receive a penalty in decoding."
        "If left empty, no penalty will be applied",
    )

    # Adds args related to checkpointing.
    group = parser.add_argument_group("Checkpointing")
    group.add_argument(
        "--no-end-of-epoch-checkpoints",
        action="store_true",
        help="Disables saving checkpoints at the end of the epoch. "
        "This differs from --no-save and --no-epoch-checkpoints in that it "
        "still allows for intra-epoch checkpoints if --save-interval is set.",
    )
    group.add_argument(
        "--max-checkpoints-kept",
        default=-1,
        type=int,
        metavar="N",
        help="Keep at most the last N checkpoints file around. "
        "A value < -1 keeps all. "
        "When --generate-bleu-eval-avg-checkpoints is used and is > N, the "
        "number of checkpoints kept around is automatically adjusted "
        "to allow BLEU to work properly.",
    )

    # Adds args for generating intermediate BLEU eval while training.
    # generate.add_args() adds args used by both train.py and the standalone
    # generate binary, while the flags defined here are used only by train.py.
    generate.add_args(parser)
    group = parser.add_argument_group("Generation")
    group.add_argument(
        "--generate-bleu-eval-per-epoch",
        action="store_true",
        help="Whether to generate BLEU score eval after each epoch.",
    )
    group.add_argument(
        "--generate-bleu-eval-interval",
        default=0,
        type=int,
        metavar="N",
        help="Does BLEU eval every N batch updates. Note that "
        "--save-interval also affects this - we can only eval as "
        "frequently as a checkpoint is written. A value of <= 0 "
        "disables this.",
    )
    group.add_argument(
        "--generate-bleu-eval-avg-checkpoints",
        default=1,
        type=int,
        metavar="N",
        help="Maximum number of last N checkpoints to average over when "
        "doing BLEU eval. Must be >= 1.",
    )
    group.add_argument(
        "--continuous-averaging-after-epochs",
        type=int,
        default=-1,
        help=("Average parameter values after each step since previous "
              "checkpoint, beginning after the specified number of epochs. "),
    )

    return parser