def get_parser_with_args(): parser = options.get_parser("Trainer", default_task="pytorch_translate") pytorch_translate_options.add_verbosity_args(parser, train=True) pytorch_translate_options.add_dataset_args(parser, train=True, gen=True) options.add_distributed_training_args(parser) # Adds args related to training (validation and stopping criterions). optimization_group = options.add_optimization_args(parser) pytorch_translate_options.expand_optimization_args(optimization_group) # Adds args related to checkpointing. checkpointing_group = options.add_checkpoint_args(parser) pytorch_translate_options.expand_checkpointing_args(checkpointing_group) # Add model related args options.add_model_args(parser) # Adds args for generating intermediate BLEU eval while training. generation_group = options.add_generation_args(parser) pytorch_translate_options.expand_generation_args(generation_group, train=True) # Adds args related to input data files (preprocessing, numberizing, and # binarizing text files; creating vocab files) pytorch_translate_options.add_preprocessing_args(parser) return parser
def get_parser_with_args(): parser = options.get_parser("Trainer") parser.add_argument( "--log-verbose", action="store_true", help="Whether to output more verbose logs for debugging/profiling.", ) pytorch_translate_options.add_dataset_args(parser, train=True, gen=True) options.add_distributed_training_args(parser) # Adds args related to training (validation and stopping criterions). optimization_group = options.add_optimization_args(parser) pytorch_translate_options.expand_optimization_args(optimization_group) # Adds args related to checkpointing. checkointing_group = options.add_checkpoint_args(parser) pytorch_translate_options.expand_checkpointing_args(checkointing_group) # Add model related args options.add_model_args(parser) # Adds args for generating intermediate BLEU eval while training. generation_group = options.add_generation_args(parser) pytorch_translate_options.expand_generation_args(generation_group, train=True) # Adds args related to input data files (preprocessing, numberizing, and # binarizing text files; creating vocab files) pytorch_translate_options.add_preprocessing_args(parser) return parser
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=0, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--test-batch-size', default=32, type=int, metavar='N',
                              help='batch size for test set')
    dataset_args.add_argument('--valid-batch-size', default=32, type=int, metavar='N',
                              help='batch size for validation set')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for validation (train, valid, valid1, test, test1)')
    dataset_args.add_argument(
        '--test-subset', default='test', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for testing (train, valid, test)')
    dataset_args.add_argument(
        '--valid-script', nargs='+', metavar='PATH',
        help='path to external validation script (optional).')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Setting args.max_tokens to infinity (same as setting it to None)
    if args.max_tokens == 0:
        args.max_tokens = None

    # Load dataset
    dataset = data.load_with_check(args.data, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in dataset.splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model
    print('| model {}'.format(args.arch))
    model = utils.build_model(args, dataset)
    criterion = utils.build_criterion(args, dataset)

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model)

    # Load the latest checkpoint if one is available
    epoch, batch_offset = trainer.load_checkpoint(
        os.path.join(args.save_dir, args.restore_file))

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, criterion, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, criterion, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    trainer.save_checkpoint(
                        args, epoch, 0, val_loss, validation_script=args.valid_script)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Generate on test set and compute BLEU score
    for beam in [1, 5, 10, 20]:
        for subset in args.test_subset.split(','):
            scorer = score_test(args, trainer.get_model(), dataset, subset, beam,
                                cuda_device=(0 if num_gpus > 0 else None))
            print('| Test on {} with beam={}: {}'.format(
                subset, beam, scorer.result_string()))

    # Stop multiprocessing
    trainer.stop()

def get_parser_with_args(): parser = options.get_parser("Trainer") options.add_dataset_args(parser, train=True, gen=True) options.add_distributed_training_args(parser) options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) options.add_generation_args(parser) parser.add_argument( "--log-verbose", action="store_true", help="Whether to output more verbose logs for debugging/profiling.", ) # Adds args related to training (validation and stopping criterions). group = parser.add_argument_group("Optimization") group.add_argument( "--subepoch-validate-interval", default=0, type=int, metavar="N", help="Calculates loss over the validation set every N batch updates. " "Note that validation is done at the end of every epoch regardless. " "A value of <= 0 disables this.", ) group.add_argument( "--stop-time-hr", default=-1, type=int, metavar="N", help="Stops training after N hours have elapsed. " "A value of < 0 disables this.", ) group.add_argument( "--stop-no-best-validate-loss", default=-1, type=int, metavar="N", help="Stops training after N validations have been run without " "achieving a better loss than before. Note that this is affected by " "--validation-interval in how frequently we run validation in the " "first place. A value of < 0 disables this.", ) group.add_argument( "--stop-no-best-bleu-eval", default=-1, type=int, metavar="N", help="Stops training after N evals have been run without " "achieving a better BLEU score than before. Note that this is affected " "by --generate-bleu-eval-interval in how frequently we run BLEU eval " "in the first place. A value of < 0 disables this.", ) # Adds args related to input data files (preprocessing, numberizing, and # binarizing text files; creating vocab files) preprocess.add_args(parser) # Adds args related to checkpointing. group = parser.add_argument_group("Checkpointing") group.add_argument( "--no-end-of-epoch-checkpoints", action="store_true", help="Disables saving checkpoints at the end of the epoch. " "This differs from --no-save and --no-epoch-checkpoints in that it " "still allows for intra-epoch checkpoints if --save-interval is set.", ) group.add_argument( "--max-checkpoints-kept", default=-1, type=int, metavar="N", help="Keep at most the last N checkpoints file around. " "A value < -1 keeps all. " "When --generate-bleu-eval-avg-checkpoints is used and is > N, the " "number of checkpoints kept around is automatically adjusted " "to allow BLEU to work properly.", ) # Adds args for generating intermediate BLEU eval while training. # generate.add_args() adds args used by both train.py and the standalone # generate binary, while the flags defined here are used only by train.py. generate.add_args(parser) group = parser.add_argument_group("Generation") group.add_argument( "--generate-bleu-eval-per-epoch", action="store_true", help="Whether to generate BLEU score eval after each epoch.", ) group.add_argument( "--generate-bleu-eval-interval", default=0, type=int, metavar="N", help="Does BLEU eval every N batch updates. Note that " "--save-interval also affects this - we can only eval as " "frequently as a checkpoint is written. A value of <= 0 " "disables this.", ) group.add_argument( "--generate-bleu-eval-avg-checkpoints", default=1, type=int, metavar="N", help="Maximum number of last N checkpoints to average over when " "doing BLEU eval. 
Must be >= 1.", ) group.add_argument( "--continuous-averaging-after-epochs", type=int, default=-1, help=("Average parameter values after each step since previous " "checkpoint, beginning after the specified number of epochs. "), ) return parser
def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" # first we need to add the general torch agent operations TorchAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Fairseq Arguments') agent.add_argument( '--seed', default=1, type=int, metavar='N', help='pseudo random number generator seed' ) agent.add_argument( '--skip-generation', default=False, type=bool, metavar='BOOL', help='Skips test time beam search. Much faster if you only need PPL', ) # Dictionary construction stuff. Using the subclass in case we end up # needing any fairseq specific things _FairseqDictionary.add_cmdline_args(argparser) # Optimization and learning rate schedule specific arguments options.add_optimization_args(argparser) known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "optimizer"): optimizer = known_args.optimizer opt_group = argparser.add_argument_group( '{} optimizer arguments'.format(optimizer) ) optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group) if hasattr(known_args, "lr_scheduler"): lr_scheduler = known_args.lr_scheduler lr_group = argparser.add_argument_group( '{} scheduler arguments'.format(lr_scheduler) ) optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args(lr_group) # Generation arguments options.add_generation_args(argparser) # We need to find out the fairseq model-specific options, so grab the # architecture stuff and look up its options arch_group = options.add_model_args(argparser) # Fairseq marks the arch flag as required, but it may be specified # by a saved model cache, so we do some weird stuff to undo that for a in arch_group._actions: if a.dest == "arch": a.required = False a.default = None break known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "arch") and known_args.arch is not None: arch = known_args.arch arch_group = argparser.add_argument_group( "{} architecture arguments".format(arch) ) models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group) # Override a few defaults from within fairseq to more sensible defaults argparser.set_defaults( clip_norm=0.1, adam_betas="(0.9,0.98)" )
def get_parser_with_args(): parser = options.get_parser('Trainer') options.add_dataset_args(parser, train=True, gen=True) options.add_distributed_training_args(parser) options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) options.add_generation_args(parser) parser.add_argument( '--log-verbose', action='store_true', help='Whether to output more verbose logs for debugging/profiling.', ) # Adds args related to training (validation and stopping criterions). group = parser.add_argument_group('Optimization') group.add_argument( '--subepoch-validate-interval', default=0, type=int, metavar='N', help='Calculates loss over the validation set every N batch updates. ' 'Note that validation is done at the end of every epoch regardless. ' 'A value of <= 0 disables this.', ) group.add_argument( '--stop-time-hr', default=-1, type=int, metavar='N', help='Stops training after N hours have elapsed. ' 'A value of < 0 disables this.', ) group.add_argument( '--stop-no-best-validate-loss', default=-1, type=int, metavar='N', help='Stops training after N validations have been run without ' 'achieving a better loss than before. Note that this is affected by ' '--validation-interval in how frequently we run validation in the ' 'first place. A value of < 0 disables this.', ) group.add_argument( '--stop-no-best-bleu-eval', default=-1, type=int, metavar='N', help='Stops training after N evals have been run without ' 'achieving a better BLEU score than before. Note that this is affected ' 'by --generate-bleu-eval-interval in how frequently we run BLEU eval ' 'in the first place. A value of < 0 disables this.', ) # Args related to dataset. group = parser.add_argument_group('Dataset and data loading') group.add_argument( '--source-vocab-file', default='', metavar='FILE', help='Path to text file representing the fairseq Dictionary to use. ' 'If left empty, the dict is auto-generated from source training data.', ) group.add_argument( '--source-max-vocab-size', default=-1, type=int, metavar='N', help='If a new vocab file needs to be generated, restrict it to the ' 'top N most common words. If we re-use an existing vocab file, this ' 'flag will have no effect. A value of < 0 means no max size.', ) group.add_argument( '--target-vocab-file', default='', metavar='FILE', help='Path to text file representing the fairseq Dictionary to use. ' 'If left empty, the dict is auto-generated from target training data.', ) group.add_argument( '--target-max-vocab-size', default=-1, type=int, metavar='N', help='If a new vocab file needs to be generated, restrict it to the ' 'top N most common words. If we re-use an existing vocab file, this ' 'flag will have no effect. A value of < 0 means no max size.', ) group.add_argument( '--train-source-text-file', default='', metavar='FILE', help='Path to raw text file containing source training examples. ' 'This overrides what would be loaded from the data dir.', ) group.add_argument( '--train-target-text-file', default='', metavar='FILE', help='Path to raw text file containing target training examples. ' 'This overrides what would be loaded from the data dir.', ) group.add_argument( '--eval-source-text-file', default='', metavar='FILE', help='Path to raw text file containing source eval examples for ' 'calculating validation loss and BLEU eval scores. 
' 'This overrides what would be loaded from the data dir.', ) group.add_argument( '--eval-target-text-file', default='', metavar='FILE', help='Path to raw text file containing target eval examples for ' 'calculating validation loss and BLEU eval scores. ' 'This overrides what would be loaded from the data dir.', ) # Adds args related to checkpointing. group = parser.add_argument_group('Checkpointing') group.add_argument( '--no-end-of-epoch-checkpoints', action='store_true', help='Disables saving checkpoints at the end of the epoch. ' 'This differs from --no-save and --no-epoch-checkpoints in that it ' 'still allows for intra-epoch checkpoints if --save-interval is set.') # Adds args for generating intermediate BLEU eval while training. # generate.add_args() adds args used by both train.py and the standalone # generate binary, while the flags defined here are used only by train.py. generate.add_args(parser) group = parser.add_argument_group('Generation') group.add_argument( '--generate-bleu-eval-per-epoch', action='store_true', help='Whether to generate BLEU score eval after each epoch.', ) group.add_argument( '--generate-bleu-eval-interval', default=0, type=int, metavar='N', help='Does BLEU eval every N batch updates. Note that ' '--save-interval also affects this - we can only eval as ' 'frequently as a checkpoint is written. A value of <= 0 ' 'disables this.', ) group.add_argument( '--generate-bleu-eval-avg-checkpoints', default=1, type=int, metavar='N', help='Maximum number of last N checkpoints to average over when ' 'doing BLEU eval. Must be >= 1.', ) group.add_argument( '--continuous-averaging-after-epochs', type=int, default=-1, help=('Average parameter values after each step since previous ' 'checkpoint, beginning after the specified number of epochs. '), ) return parser
def main(): parser = options.get_parser('Trainer') dataset_args = options.add_dataset_args(parser) dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N', help='maximum number of tokens in a batch') dataset_args.add_argument( '--train-subset', default='train', metavar='SPLIT', choices=['train', 'valid', 'test'], help='data subset to use for training (train, valid, test)') dataset_args.add_argument( '--valid-subset', default='valid', metavar='SPLIT', help='comma separated list ofdata subsets ' ' to use for validation (train, valid, valid1,test, test1)') options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) args = utils.parse_args_and_arch(parser) print(args) if args.no_progress_bar: progress_bar.enabled = False progress_bar.print_interval = args.log_interval if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) torch.manual_seed(args.seed) # Load dataset dataset = data.load_with_check(args.data, ['train', 'valid'], args.source_lang, args.target_lang) if args.source_lang is None or args.target_lang is None: # record inferred languages in args, so that it's saved in checkpoints args.source_lang, args.target_lang = dataset.src, dataset.dst print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) for split in ['train', 'valid']: print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split]))) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') num_gpus = torch.cuda.device_count() print('| using {} GPUs (with max tokens per GPU = {})'.format( num_gpus, args.max_tokens)) # Build model and criterion model = utils.build_model(args, dataset.src_dict, dataset.dst_dict) criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) # Start multiprocessing trainer = MultiprocessingTrainer(args, model, criterion) # Load the latest checkpoint if one is available checkpoint_path = os.path.join(args.save_dir, args.restore_file) extra_state = trainer.load_checkpoint(checkpoint_path) if extra_state is not None: epoch = extra_state['epoch'] batch_offset = extra_state['batch_offset'] print('| loaded checkpoint {} (epoch {})'.format( checkpoint_path, epoch)) if batch_offset == 0: epoch += 1 else: epoch, batch_offset = 1, 0 # Train until the learning rate gets too small val_loss = None max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch <= max_epoch: # train for one epoch train(args, epoch, batch_offset, trainer, dataset, num_gpus) # evaluate on validate set for k, subset in enumerate(args.valid_subset.split(',')): val_loss = validate(args, epoch, trainer, dataset, subset, num_gpus) if k == 0: if not args.no_save: # save checkpoint save_checkpoint(trainer, args, epoch, 0, val_loss) # only use first validation loss to update the learning schedule lr = trainer.lr_step(val_loss, epoch) epoch += 1 batch_offset = 0 train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum)) # Stop multiprocessing trainer.stop()
def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" # first we need to add the general torch agent operations TorchAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Fairseq Arguments') agent.add_argument('--fp16', default=False, type=bool, help='Use fp16 training') agent.add_argument('--seed', default=1, type=int, metavar='N', help='pseudo random number generator seed') agent.add_argument( '--skip-generation', default=False, type=bool, metavar='BOOL', help= 'Skips test time beam search. Much faster if you only need PPL', ) # Dictionary construction stuff. Using the subclass in case we end up # needing any fairseq specific things cls.dictionary_class().add_cmdline_args(argparser) # Check subargs for generation, optimizers, criterions, archs, etc options.add_generation_args(argparser) options.add_optimization_args(argparser) # make sure we set defaults according to the model before parsing argparser.set_defaults(**cls.DEFAULT_OPTIONS) known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "optimizer"): optimizer = known_args.optimizer opt_group = argparser.add_argument_group( '{} optimizer arguments'.format(optimizer)) optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group) if hasattr(known_args, "lr_scheduler"): lr_scheduler = known_args.lr_scheduler lr_group = argparser.add_argument_group( '{} scheduler arguments'.format(lr_scheduler)) optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args( lr_group) # We need to find out the fairseq model-specific options, so grab the # architecture stuff and look up its options arch_group = options.add_model_args(argparser) # Fairseq marks the arch flag as required, but it may be specified # by a saved model cache, so we do some weird stuff to undo that for a in arch_group._actions: if a.dest == "arch": a.required = False a.default = None break # make sure we set defaults according to parlai model before parsing argparser.set_defaults(**cls.DEFAULT_OPTIONS) known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "arch") and known_args.arch is not None: arch = known_args.arch arch_group = argparser.add_argument_group( "{} architecture arguments".format(arch)) models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group) if hasattr(known_args, "criterion"): crit_group = argparser.add_argument_group( '{} criterion arguments'.format(known_args.criterion)) criterions.CRITERION_REGISTRY[known_args.criterion].add_args( crit_group) # As one final check, let's make sure we set defaults correctly argparser.set_defaults(**cls.DEFAULT_OPTIONS)
def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" # first we need to add the general torch agent operations super(FairseqAgent, cls).add_cmdline_args(argparser) # let's store any defaults that were overridden old_defaults = argparser._defaults if 'clip_norm' not in old_defaults: # fairseq has a few awful defaults old_defaults['clip_norm'] = 1.0 if 'optimizer' not in old_defaults: old_defaults['optimizer'] = 'adam' old_defaults['adam_betas'] = '(0.9,0.98)' agent = argparser.add_argument_group('Fairseq Arguments') agent.add_argument('--fp16', default=False, type='bool', help='Use fp16 training') agent.add_argument( '--fp16-init-scale', default=2**7, type=int, help='default FP16 loss scale', ) agent.add_argument( '--seed', default=1, type=int, metavar='N', help='pseudo random number generator seed', ) agent.add_argument( '--skip-generation', default=False, type='bool', metavar='BOOL', help= 'Skips test time beam search. Much faster if you only need PPL', ) # Check subargs for generation, optimizers, criterions, archs, etc options.add_generation_args(argparser) options.add_optimization_args(argparser) options.add_checkpoint_args(argparser) # restore any user set defaults that fairseq possibly overrode argparser.set_defaults(**old_defaults) known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "optimizer"): optimizer = known_args.optimizer opt_group = argparser.add_argument_group( '{} optimizer arguments'.format(optimizer)) optim.OPTIMIZER_REGISTRY[optimizer].add_args(opt_group) if hasattr(known_args, "lr_scheduler"): lr_scheduler = known_args.lr_scheduler lr_group = argparser.add_argument_group( '{} scheduler arguments'.format(lr_scheduler)) optim.lr_scheduler.LR_SCHEDULER_REGISTRY[lr_scheduler].add_args( lr_group) # We need to find out the fairseq model-specific options, so grab the # architecture stuff and look up its options arch_group = options.add_model_args(argparser) # Fairseq marks the arch flag as required, but it may be specified # by a saved model cache, so we do some weird stuff to undo that for a in arch_group._actions: if a.dest == "arch": a.required = False a.default = None break # once again restore any user-set defaults argparser.set_defaults(**old_defaults) known_args = argparser.parse_known_args(nohelp=True)[0] if hasattr(known_args, "arch") and known_args.arch is not None: arch = known_args.arch arch_group = argparser.add_argument_group( "{} architecture arguments".format(arch)) models.ARCH_MODEL_REGISTRY[arch].add_args(arch_group) if hasattr(known_args, "criterion"): crit_group = argparser.add_argument_group( '{} criterion arguments'.format(known_args.criterion)) criterions.CRITERION_REGISTRY[known_args.criterion].add_args( crit_group) # one last time, restore any user set defaults argparser.set_defaults(**old_defaults)
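
# Small demonstration (plain argparse, not from the original sources) of why the
# saved defaults are re-applied above: argparse's set_defaults overrides the
# default of an already-registered argument, so re-applying the user/ParlAI
# defaults after fairseq registers its own flags restores the intended values.
# The 25.0 stand-in value is illustrative only.
import argparse

p = argparse.ArgumentParser()
p.add_argument("--clip-norm", type=float, default=25.0)  # stand-in for a fairseq-provided default
p.set_defaults(clip_norm=1.0)                            # user default re-applied last wins
print(p.parse_args([]).clip_norm)                        # -> 1.0
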
def main(): parser = options.get_parser('Trainer') dataset_args = options.add_dataset_args(parser) dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N', help='maximum number of tokens in a batch') dataset_args.add_argument('--max-sentences', type=int, metavar='N', help='maximum number of sentences in a batch') dataset_args.add_argument( '--train-subset', default='train', metavar='SPLIT', choices=['train', 'valid', 'test'], help='data subset to use for training (train, valid, test)') dataset_args.add_argument( '--valid-subset', default='valid', metavar='SPLIT', help='comma separated list of data subsets ' ' to use for validation (train, valid, valid1,test, test1)') dataset_args.add_argument( '--max-sentences-valid', type=int, metavar='N', help='maximum number of sentences in a validation batch') options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) args = utils.parse_args_and_arch(parser) if args.no_progress_bar and args.log_format is None: args.log_format = 'simple' if args.max_sentences_valid is None: args.max_sentences_valid = args.max_sentences if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) torch.manual_seed(args.seed) # Load dataset splits = ['train', 'valid'] if data.has_binary_files(args.data, splits): dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang) else: dataset = data.load_raw_text_dataset(args.data, splits, args.source_lang, args.target_lang) if args.source_lang is None or args.target_lang is None: # record inferred languages in args, so that it's saved in checkpoints args.source_lang, args.target_lang = dataset.src, dataset.dst if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') args.num_gpus = torch.cuda.device_count() print(args) print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) for split in splits: print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split]))) print( '| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})' .format(args.num_gpus, args.max_tokens, args.max_sentences)) # Build model and criterion model = utils.build_model(args, dataset.src_dict, dataset.dst_dict) criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. 
model params: {}'.format( sum(p.data.numel() for p in model.parameters()))) # The max number of positions can be different for train and valid # e.g., RNNs may support more positions at test time than seen in training max_positions_train = (min(args.max_source_positions, model.max_encoder_positions()), min(args.max_target_positions, model.max_decoder_positions())) max_positions_valid = (model.max_encoder_positions(), model.max_decoder_positions()) # Start multiprocessing trainer = MultiprocessingTrainer(args, model, criterion) # Create files to save losses traincsv_path = os.path.join(args.save_dir, 'train_losses.csv') validcsv_path = os.path.join(args.save_dir, 'valid_losses.csv') output_path = [traincsv_path, validcsv_path] for path in output_path: with open(path, 'w+') as csvfile: csvwriter = csv.writer(csvfile, delimiter=',') csvwriter.writerow(['Epoch', 'Perplexity', 'Loss']) csvfile.close() # Load the latest checkpoint if one is available checkpoint_path = os.path.join(args.save_dir, args.restore_file) extra_state = trainer.load_checkpoint(checkpoint_path) if extra_state is not None: epoch = extra_state['epoch'] batch_offset = extra_state['batch_offset'] print('| loaded checkpoint {} (epoch {})'.format( checkpoint_path, epoch)) if batch_offset == 0: epoch += 1 else: epoch, batch_offset = 1, 0 # Train until the learning rate gets too small val_loss = None max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch <= max_epoch: # train for one epoch train(args, epoch, batch_offset, trainer, dataset, max_positions_train, traincsv_path) # evaluate on validate set for k, subset in enumerate(args.valid_subset.split(',')): val_loss = validate(args, epoch, trainer, dataset, max_positions_valid, subset, validcsv_path) if k == 0: if not args.no_save: # save checkpoint save_checkpoint(trainer, args, epoch, 0, val_loss) # only use first validation loss to update the learning schedule lr = trainer.lr_step(val_loss, epoch) epoch += 1 batch_offset = 0 train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum)) # Stop multiprocessing trainer.stop()
def predict_length_beam(gold_target_len, predicted_lengths, length_beam_size):
    if gold_target_len is not None:
        beam_starts = gold_target_len - (length_beam_size - 1) // 2
        beam_ends = gold_target_len + length_beam_size // 2 + 1
        beam = torch.stack(
            [torch.arange(beam_starts[batch], beam_ends[batch], device=beam_starts.device)
             for batch in range(gold_target_len.size(0))], dim=0)
    else:
        beam = predicted_lengths.topk(length_beam_size, dim=1)[1]
    beam[beam < 2] = 2
    return beam


if __name__ == '__main__':
    parser = options.get_generation_parser()
    options.add_model_args(parser)
    parser.add_argument("--all", action="store_true")
    parser.add_argument("--stepwise", action="store_true")
    parser.add_argument("--semiat", action="store_true")
    parser.add_argument("--scan-checkpoints", action="store_true")
    parser.add_argument("--checkpoint-name", type=str, default="best")
    parser.add_argument("--ensemble", action="store_true")
    parser.add_argument("--end-iteration", default=-1, type=int)
    args = options.parse_args_and_arch(parser)
    if args.all:
        for i in [0, 2, 4, 8, 10]:
            print("testing with iterations", i)
            args.decoding_iterations = i
            main(args)
    elif args.stepwise:
        for i in range(args.decoding_iterations):
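
# Worked example (illustration only, assumes PyTorch is available): with gold
# target lengths [7, 12] and length_beam_size=5, each row of the returned beam
# is a window of candidate lengths centered on the gold length.
import torch

gold = torch.tensor([7, 12])
beam = predict_length_beam(gold, predicted_lengths=None, length_beam_size=5)
# beam_starts = gold - 2 -> [5, 10]; beam_ends = gold + 3 -> [10, 15], so
# beam == tensor([[ 5,  6,  7,  8,  9],
#                 [10, 11, 12, 13, 14]])
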
def main(): parser = options.get_parser('Trainer') dataset_args = options.add_dataset_args(parser) dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N', help='maximum number of tokens in a batch') dataset_args.add_argument('--max-sentences', type=int, metavar='N', help='maximum number of sentences in a batch') dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT', choices=['train', 'valid', 'test'], help='data subset to use for training (train, valid, test)') dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT', help='comma separated list of data subsets ' ' to use for validation (train, valid, valid1,test, test1)') options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) args = utils.parse_args_and_arch(parser) if args.no_progress_bar and args.log_format is None: args.log_format = 'simple' if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) torch.manual_seed(args.seed) # Load dataset splits = ['train', 'valid'] if data.has_binary_files(args.data, splits): dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang) else: dataset = data.load_raw_text_dataset(args.data, splits, args.source_lang, args.target_lang) if args.source_lang is None or args.target_lang is None: # record inferred languages in args, so that it's saved in checkpoints args.source_lang, args.target_lang = dataset.src, dataset.dst print(args) print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) for split in splits: print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split]))) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') num_gpus = torch.cuda.device_count() print('| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'.format( num_gpus, args.max_tokens, args.max_sentences)) # Build model and criterion model = utils.build_model(args, dataset.src_dict, dataset.dst_dict) criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) # The max number of positions can be different for train and valid # e.g., RNNs may support more positions at test time than seen in training max_positions_train = (args.max_source_positions, args.max_target_positions) max_positions_valid = ( min(args.max_source_positions, model.max_encoder_positions()), min(args.max_target_positions, model.max_decoder_positions()) ) # Start multiprocessing trainer = MultiprocessingTrainer(args, model, criterion) # Load the latest checkpoint if one is available checkpoint_path = os.path.join(args.save_dir, args.restore_file) extra_state = trainer.load_checkpoint(checkpoint_path) if extra_state is not None: epoch = extra_state['epoch'] batch_offset = extra_state['batch_offset'] print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch)) if batch_offset == 0: epoch += 1 else: epoch, batch_offset = 1, 0 # Train until the learning rate gets too small val_loss = None max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch <= max_epoch: # train for one epoch train(args, epoch, batch_offset, trainer, dataset, max_positions_train, num_gpus) # evaluate on validate set for k, subset in enumerate(args.valid_subset.split(',')): 
val_loss = validate(args, epoch, trainer, dataset, max_positions_valid, subset, num_gpus) if k == 0: if not args.no_save: # save checkpoint save_checkpoint(trainer, args, epoch, 0, val_loss) # only use first validation loss to update the learning schedule lr = trainer.lr_step(val_loss, epoch) epoch += 1 batch_offset = 0 train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum)) # Stop multiprocessing trainer.stop()
def get_parser_with_args(): parser = options.get_parser("Trainer") options.add_dataset_args(parser, train=True, gen=True) options.add_distributed_training_args(parser) options.add_optimization_args(parser) options.add_checkpoint_args(parser) options.add_model_args(parser) options.add_generation_args(parser) parser.add_argument( "--log-verbose", action="store_true", help="Whether to output more verbose logs for debugging/profiling.", ) # Adds args related to training (validation and stopping criterions). group = parser.add_argument_group("Optimization") group.add_argument( "--subepoch-validate-interval", default=0, type=int, metavar="N", help="Calculates loss over the validation set every N batch updates. " "Note that validation is done at the end of every epoch regardless. " "A value of <= 0 disables this.", ) group.add_argument( "--stop-time-hr", default=-1, type=int, metavar="N", help="Stops training after N hours have elapsed. " "A value of < 0 disables this.", ) group.add_argument( "--stop-no-best-validate-loss", default=-1, type=int, metavar="N", help="Stops training after N validations have been run without " "achieving a better loss than before. Note that this is affected by " "--validation-interval in how frequently we run validation in the " "first place. A value of < 0 disables this.", ) group.add_argument( "--stop-no-best-bleu-eval", default=-1, type=int, metavar="N", help="Stops training after N evals have been run without " "achieving a better BLEU score than before. Note that this is affected " "by --generate-bleu-eval-interval in how frequently we run BLEU eval " "in the first place. A value of < 0 disables this.", ) # Args related to dataset. group = parser.add_argument_group("Dataset and data loading") group.add_argument( "--source-vocab-file", default="", metavar="FILE", help="Path to text file representing the fairseq Dictionary to use. " "If left empty, the dict is auto-generated from source training data.", ) group.add_argument( "--source-max-vocab-size", default=-1, type=int, metavar="N", help="If a new vocab file needs to be generated, restrict it to the " "top N most common words. If we re-use an existing vocab file, this " "flag will have no effect. A value of < 0 means no max size.", ) group.add_argument( "--target-vocab-file", default="", metavar="FILE", help="Path to text file representing the fairseq Dictionary to use. " "If left empty, the dict is auto-generated from target training data.", ) group.add_argument( "--target-max-vocab-size", default=-1, type=int, metavar="N", help="If a new vocab file needs to be generated, restrict it to the " "top N most common words. If we re-use an existing vocab file, this " "flag will have no effect. A value of < 0 means no max size.", ) group.add_argument( "--train-source-text-file", default="", metavar="FILE", help="Path to raw text file containing source training examples. " "This overrides what would be loaded from the data dir.", ) group.add_argument( "--train-target-text-file", default="", metavar="FILE", help="Path to raw text file containing target training examples. " "This overrides what would be loaded from the data dir.", ) group.add_argument( "--eval-source-text-file", default="", metavar="FILE", help="Path to raw text file containing source eval examples for " "calculating validation loss and BLEU eval scores. 
" "This overrides what would be loaded from the data dir.", ) group.add_argument( "--eval-target-text-file", default="", metavar="FILE", help="Path to raw text file containing target eval examples for " "calculating validation loss and BLEU eval scores. " "This overrides what would be loaded from the data dir.", ) group.add_argument( "--penalized-target-tokens-file", default="", metavar="FILE", help="Path to text file of tokens to receive a penalty in decoding." "If left empty, no penalty will be applied", ) # Adds args related to checkpointing. group = parser.add_argument_group("Checkpointing") group.add_argument( "--no-end-of-epoch-checkpoints", action="store_true", help="Disables saving checkpoints at the end of the epoch. " "This differs from --no-save and --no-epoch-checkpoints in that it " "still allows for intra-epoch checkpoints if --save-interval is set.", ) group.add_argument( "--max-checkpoints-kept", default=-1, type=int, metavar="N", help="Keep at most the last N checkpoints file around. " "A value < -1 keeps all. " "When --generate-bleu-eval-avg-checkpoints is used and is > N, the " "number of checkpoints kept around is automatically adjusted " "to allow BLEU to work properly.", ) # Adds args for generating intermediate BLEU eval while training. # generate.add_args() adds args used by both train.py and the standalone # generate binary, while the flags defined here are used only by train.py. generate.add_args(parser) group = parser.add_argument_group("Generation") group.add_argument( "--generate-bleu-eval-per-epoch", action="store_true", help="Whether to generate BLEU score eval after each epoch.", ) group.add_argument( "--generate-bleu-eval-interval", default=0, type=int, metavar="N", help="Does BLEU eval every N batch updates. Note that " "--save-interval also affects this - we can only eval as " "frequently as a checkpoint is written. A value of <= 0 " "disables this.", ) group.add_argument( "--generate-bleu-eval-avg-checkpoints", default=1, type=int, metavar="N", help="Maximum number of last N checkpoints to average over when " "doing BLEU eval. Must be >= 1.", ) group.add_argument( "--continuous-averaging-after-epochs", type=int, default=-1, help=("Average parameter values after each step since previous " "checkpoint, beginning after the specified number of epochs. "), ) return parser
def cli_main():
    parser = options.get_generation_parser(interactive=True)
    options.add_model_args(parser)
    args = options.parse_args_and_arch(parser)
    main(args)