def cli_main():
    tmp_parser = options.get_parser('Preprocessing', 'translation')
    tmp_args, _ = tmp_parser.parse_known_args()
    parser = options.get_parser('Preprocessing', 'translation')
    tasks.get_task(tmp_args.task).add_args(parser)
    options.add_preprocess_args(parser)
    args = parser.parse_args()
    main(args)
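# Sketch (an assumption, not part of the original source): cli_main() above
# parses twice because the full flag set depends on --task, which is not known
# until a first pass over argv. A minimal standalone illustration of the same
# two-pass pattern with plain argparse:
import argparse

def two_pass_parse(argv=None):
    # Pass 1: a bootstrap parser that only discovers the task.
    bootstrap = argparse.ArgumentParser(add_help=False)
    bootstrap.add_argument('--task', default='translation')
    known, _ = bootstrap.parse_known_args(argv)
    # Pass 2: the real parser, extended with task-specific flags.
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', default=known.task)
    if known.task == 'translation':
        parser.add_argument('--source-lang')  # hypothetical task-specific flag
    return parser.parse_args(argv)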
def get_parser_with_args():
    parser = options.get_parser(
        "Collect Top-K Probs", default_task="pytorch_translate"
    )
    pytorch_translate_options.add_verbosity_args(parser)
    pytorch_translate_options.add_dataset_args(parser, gen=True)
    generation_group = options.add_generation_args(parser)
    generation_group.add_argument(
        "--source-binary-file",
        default="",
        help="Path for the binary file containing source eval examples. "
        "(Overrides --source-text-file. Must be used in conjunction with "
        "--target-binary-file).",
    )
    generation_group.add_argument(
        "--target-binary-file",
        default="",
        help="Path for the binary file containing target eval examples. "
        "(Overrides --target-text-file. Must be used in conjunction with "
        "--source-binary-file).",
    )
    generation_group.add_argument(
        "--k-probs-to-collect",
        type=int,
        default=8,
        help="Number of probabilities to collect for each output step.",
    )
    generation_group.add_argument(
        "--top-k-probs-binary-file",
        type=str,
        default="",
        help="File into which to save top-K probabilities for each token.",
    )
    return parser
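# Illustrative sketch (an assumption, not the original collection code): what
# --k-probs-to-collect configures, i.e. keeping the top-K token probabilities
# at each output step via torch.topk.
import torch

def collect_top_k_probs(step_probs, k=8):
    # step_probs: FloatTensor of shape (num_steps, vocab_size).
    top_probs, top_indices = torch.topk(step_probs, k, dim=1)
    return top_probs, top_indices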
def get_parser_with_args():
    parser = options.get_parser("Trainer")
    parser.add_argument(
        "--log-verbose",
        action="store_true",
        help="Whether to output more verbose logs for debugging/profiling.",
    )
    pytorch_translate_options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)

    # Adds args related to training (validation and stopping criterions).
    optimization_group = options.add_optimization_args(parser)
    pytorch_translate_options.expand_optimization_args(optimization_group)

    # Adds args related to checkpointing.
    checkpointing_group = options.add_checkpoint_args(parser)
    pytorch_translate_options.expand_checkpointing_args(checkpointing_group)

    # Add model related args
    options.add_model_args(parser)

    # Adds args for generating intermediate BLEU eval while training.
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group, train=True)

    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    pytorch_translate_options.add_preprocessing_args(parser)
    return parser
def get_parser_with_args():
    parser = options.get_parser("Generation")
    options.add_dataset_args(parser, gen=True)
    options.add_generation_args(parser)
    add_args(parser)

    group = parser.add_argument_group("Generation")
    group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    group.add_argument(
        "--source-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in source dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    group.add_argument(
        "--target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in target dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    return parser
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    options.add_dataset_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, model_args = utils.load_ensemble_for_inference(args.path,
                                                           data_dir=args.data)
    src_dict, dst_dict = models[0].src_dict, models[0].dst_dict

    print('| [{}] dictionary: {} types'.format(model_args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(model_args.target_lang, len(dst_dict)))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print('| Type the input sentence and press return:')
    for src_str in sys.stdin:
        src_str = src_str.strip()
        src_tokens = tokenizer.Tokenizer.tokenize(
            src_str, src_dict, add_if_not_exist=False).long()
        if use_cuda:
            src_tokens = src_tokens.cuda()
        translations = translator.generate(Variable(src_tokens.view(1, -1)))
        hypos = translations[0]
        print('O\t{}'.format(src_str))

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu(),
                align_dict=align_dict,
                dst_dict=dst_dict,
                remove_bpe=args.remove_bpe)
            print('H\t{}\t{}'.format(hypo['score'], hypo_str))
            print('A\t{}'.format(' '.join(map(str, alignment))))
def get_generation_parser(interactive=False, default_task='translation'):
    parser = get_parser('Generation', default_task)
    add_dataset_args(parser, gen=True)
    add_distributed_training_args(parser, default_world_size=1)
    add_generation_args(parser)
    add_checkpoint_args(parser)
    if interactive:
        add_interactive_args(parser)
    return parser
def get_training_and_generation_parser(default_task='translation'):
    parser = options.get_parser('Trainer', default_task)
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_generation_args(parser)
    options.add_distributed_training_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    return parser
def get_parser_with_args():
    """Create argument parser with arguments specific to this script"""
    parser = options.get_parser(
        "Whitebox attack", default_task="pytorch_translate_adversarial"
    )

    # Data related arguments
    data_group = pytorch_translate_options.add_dataset_args(parser, gen=True)
    # Adds args used by the standalone generate binary.
    data_group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    data_group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help=(
            "Same as --source-vocab-file except using characters. "
            "(For use with char_source models only.)"
        ),
    )
    data_group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    data_group.add_argument(
        "--source-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in source dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    data_group.add_argument(
        "--target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in target dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    data_group.add_argument(
        "--adversarial-output-file",
        default="",
        type=str,
        metavar="FILE",
        help="Path to text file to store the generated adversarial examples.",
    )

    # Adversarial attack specific group
    adversarial_options.add_adversarial_args(parser, attack_only=True)

    return parser
def get_parser_with_args():
    parser = options.get_parser("Generation")
    options.add_dataset_args(parser, gen=True)
    options.add_generation_args(parser)
    pytorch_translate_generate.add_args(parser)

    group = parser.add_argument_group("Generation")
    group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )

    # Add args related to benchmarking.
    group = parser.add_argument_group("Benchmarking")
    group.add_argument(
        "--increment",
        default=5,
        type=int,
        help="Difference in lengths between synthesized sentences. "
        "Must be an integer >= 1.",
    )
    group.add_argument(
        "--max-length",
        default=100,
        type=int,
        help="Maximum allowed length for synthesized sentences. "
        "Should be greater than --increment.",
    )
    group.add_argument(
        "--samples-per-length",
        default=1,
        type=int,
        help="Number of sentences to be synthesized at each length.",
    )
    return parser
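# Illustrative sketch (an assumption, not the original benchmark driver): how
# --increment, --max-length, and --samples-per-length above might be turned
# into synthetic sentences of increasing length.
def synthesized_benchmark_inputs(increment, max_length, samples_per_length,
                                 token="hello"):
    # Yields samples_per_length sentences at each length in
    # increment, 2 * increment, ..., up to max_length.
    for length in range(increment, max_length + 1, increment):
        for _ in range(samples_per_length):
            yield " ".join([token] * length)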
def get_preprocessing_parser():
    parser = get_parser("Preprocessing", "translation")
    group = parser.add_argument_group("Preprocessing")
    # fmt: off
    parser.add_argument('--format', metavar='INP',
                        help='Input format for audio files')
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    parser.add_argument('--legacy-audio-fix-lua-indexing', action='store_true',
                        default=False,
                        help='if set, 1 is added to the input filterbanks for '
                             'compatibility with the lua indexing fix')
    # TODO: add parallel implementation
    # fmt: on
    return parser
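# Hedged usage sketch (paths are hypothetical; assumes the imports used by
# get_preprocessing_parser above are in scope):
if __name__ == "__main__":
    parser = get_preprocessing_parser()
    args = parser.parse_args(
        ["--trainpref", "data/train", "--validpref", "data/valid",
         "--destdir", "data-bin"]
    )
    print(args.destdir)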
def get_parser_with_args(default_task="pytorch_translate"):
    parser = options.get_parser("Trainer", default_task=default_task)
    pytorch_translate_options.add_verbosity_args(parser, train=True)
    pytorch_translate_options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)

    # Adds args related to training (validation and stopping criterions).
    optimization_group = options.add_optimization_args(parser)
    pytorch_translate_options.expand_optimization_args(optimization_group)

    # Adds args related to checkpointing.
    checkpointing_group = options.add_checkpoint_args(parser)
    pytorch_translate_options.expand_checkpointing_args(checkpointing_group)

    # Add model related args
    options.add_model_args(parser)

    # Adds args for generating intermediate BLEU eval while training.
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group, train=True)

    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    pytorch_translate_options.add_preprocessing_args(parser)
    return parser
def get_rendering_parser(default_task="single_object_rendering"):
    parser = options.get_parser("Rendering", default_task)
    options.add_dataset_args(parser, gen=True)
    add_rendering_args(parser)
    return parser
    for arg in vars(parsed_args).keys():
        setattr(args, arg, getattr(parsed_args, arg))

    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(
        args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    eval_dataset(task, model, task.dataset(args.gen_subset), args.out_file,
                 args.thresholds, args.compute_metrics, use_cuda)


if __name__ == '__main__':
    parser = options.get_parser('Evaluate Single Sentence Classifier',
                                'sentence_classification')
    options.add_common_eval_args(parser)
    options.add_dataset_args(parser, gen=True)
    parser.add_argument('--out-file', type=str, help='output filename')
    parser.add_argument('--thresholds', nargs='+', type=float,
                        help='thresholds to try or use')
    parser.add_argument(
        '--compute-metrics', action='store_true',
        help='if set, uses the labels to compute metrics for each threshold')
    args = options.parse_args_and_arch(parser)
    main(args)
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    dataset = data.load_with_check(args.data, ['train', 'valid'],
                                   args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in ['train', 'valid']:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
def get_parser_with_args():
    parser = options.get_parser("Generation", default_task="pytorch_translate")
    pytorch_translate_options.add_verbosity_args(parser)
    pytorch_translate_options.add_dataset_args(parser, gen=True)
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group)
    generation_group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help=(
            "Same as --source-vocab-file except using characters. "
            "(For use with char_source models only.)"
        ),
    )
    generation_group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--multiling-source-lang",
        action="append",
        metavar="SRC",
        help=(
            "Must be set for decoding with multilingual models. "
            "Must match an entry from --multiling-encoder-lang from training."
        ),
    )
    generation_group.add_argument(
        "--multiling-target-lang",
        action="append",
        metavar="TARGET",
        help=(
            "Must be set for decoding with multilingual models. "
            "Must match an entry from --multiling-decoder-lang from training."
        ),
    )

    # Add args related to benchmarking.
    group = parser.add_argument_group("Benchmarking")
    group.add_argument(
        "--runs-per-length",
        default=10,
        type=int,
        help="Number of times to run generation on each length.",
    )
    group.add_argument(
        "--examples-per-length",
        default=1,
        type=int,
        help="Sentences of each length to include in each eval (batched if >1).",
    )
    return parser
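# Hedged sketch (the timing harness is an assumption, not the original
# benchmark code): how --runs-per-length above might be consumed, timing
# repeated generation runs on one batch and reporting the mean.
import time

def time_runs(generate_fn, batch, runs_per_length):
    # generate_fn: any callable taking a batch; returns mean seconds per run.
    elapsed = 0.0
    for _ in range(runs_per_length):
        start = time.perf_counter()
        generate_fn(batch)
        elapsed += time.perf_counter() - start
    return elapsed / runs_per_length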
def get_tuning_parser(default_task='translation'):
    parser = options.get_parser('Reranking tuning', default_task)
    add_reranking_args(parser)
    add_tuning_args(parser)
    return parser
def get_reranking_parser(default_task='translation'):
    parser = options.get_parser('Generation and reranking', default_task)
    add_reranking_args(parser)
    return parser
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument(
        '--gen-subset', default='test', metavar='SPLIT',
        help='data subset to generate (train, valid, test)')
    dataset_args.add_argument('--num-shards', default=1, type=int, metavar='N',
                              help='shard generation over N shards')
    dataset_args.add_argument(
        '--shard-id', default=0, type=int, metavar='ID',
        help='id of the shard to generate (id < num_shards)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'none'
    # print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu
    if hasattr(torch, 'set_grad_enabled'):
        torch.set_grad_enabled(False)

    # Load dataset
    if args.replace_unk is None:
        dataset = data.load_dataset(args.data, [args.gen_subset],
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, [args.gen_subset],
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    # print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, _ = utils.load_ensemble_for_inference(args.path, dataset.src_dict,
                                                  dataset.dst_dict)

    # print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    # print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    # print('| {} {} {} examples'.format(args.data, args.gen_subset,
    #                                    len(dataset.splits[args.gen_subset])))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute accuracy (BLEU scoring is disabled below)
    # scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
    #                      dataset.dst_dict.unk())
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        args.gen_subset, max_sentences=args.batch_size,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError('--shard-id must be between 0 and num_shards')
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    with utils.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
            cuda_device=0 if use_cuda else None, timer=gen_timer)
        correct = 0
        total = 0
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[
                    args.gen_subset].src.get_original_text(sample_id)
                target_str = dataset.splits[
                    args.gen_subset].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

            # if not args.quiet:
            #     print('S-{}\t{}'.format(sample_id, src_str))
            #     print('T-{}\t{}'.format(sample_id, target_str))
            total += 1

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe)

                # if not args.quiet:
                #     print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                #     print('A-{}\t{}'.format(sample_id, ' '.join(map(str, alignment))))

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk
                        # replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, dataset.dst_dict, add_if_not_exist=True)
                    # scorer.add(target_tokens, hypo_tokens)

                    # NOTE: `to_write` is not defined in this snippet; it is
                    # assumed to be an open file handle for dumping attention
                    # matrices and hypotheses.
                    mat = ''
                    for row in hypo['attention']:
                        for column in row:
                            mat += str(column) + '\t'
                        mat += '\n'
                    tar = '/' + target_str
                    tra = '=' + str(target_str == hypo_str)
                    to_write.write(mat)
                    to_write.write(src_str)
                    to_write.write('\n')
                    to_write.write(hypo_str)
                    to_write.write('\n')
                    to_write.write(tar)
                    to_write.write('\n')
                    to_write.write(tra)
                    to_write.write('\n')
                    to_write.write('-----------')
                    to_write.write('\n')
                    if hypo_str == target_str:
                        correct += 1

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Correct : {} - Total: {}. Accuracy: {:.5f}'.format(
        correct, total, correct / total))
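# Illustrative sketch (an assumption, analogous to data.sharded_iterator used
# above): sharding one iterator across num_shards workers so that shard
# `shard_id` sees every num_shards-th item.
import itertools

def sharded_iterator(itr, num_shards, shard_id):
    return itertools.islice(itr, shard_id, None, num_shards)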
def get_parser_with_args():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    options.add_generation_args(parser)

    parser.add_argument(
        '--log-verbose',
        action='store_true',
        help='Whether to output more verbose logs for debugging/profiling.',
    )

    # Adds args related to training (validation and stopping criterions).
    group = parser.add_argument_group('Optimization')
    group.add_argument(
        '--subepoch-validate-interval',
        default=0,
        type=int,
        metavar='N',
        help='Calculates loss over the validation set every N batch updates. '
        'Note that validation is done at the end of every epoch regardless. '
        'A value of <= 0 disables this.',
    )
    group.add_argument(
        '--stop-time-hr',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N hours have elapsed. '
        'A value of < 0 disables this.',
    )
    group.add_argument(
        '--stop-no-best-validate-loss',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N validations have been run without '
        'achieving a better loss than before. Note that this is affected by '
        '--validation-interval in how frequently we run validation in the '
        'first place. A value of < 0 disables this.',
    )
    group.add_argument(
        '--stop-no-best-bleu-eval',
        default=-1,
        type=int,
        metavar='N',
        help='Stops training after N evals have been run without '
        'achieving a better BLEU score than before. Note that this is affected '
        'by --generate-bleu-eval-interval in how frequently we run BLEU eval '
        'in the first place. A value of < 0 disables this.',
    )

    # Args related to dataset.
    group = parser.add_argument_group('Dataset and data loading')
    group.add_argument(
        '--source-vocab-file',
        default='',
        metavar='FILE',
        help='Path to text file representing the fairseq Dictionary to use. '
        'If left empty, the dict is auto-generated from source training data.',
    )
    group.add_argument(
        '--source-max-vocab-size',
        default=-1,
        type=int,
        metavar='N',
        help='If a new vocab file needs to be generated, restrict it to the '
        'top N most common words. If we re-use an existing vocab file, this '
        'flag will have no effect. A value of < 0 means no max size.',
    )
    group.add_argument(
        '--target-vocab-file',
        default='',
        metavar='FILE',
        help='Path to text file representing the fairseq Dictionary to use. '
        'If left empty, the dict is auto-generated from target training data.',
    )
    group.add_argument(
        '--target-max-vocab-size',
        default=-1,
        type=int,
        metavar='N',
        help='If a new vocab file needs to be generated, restrict it to the '
        'top N most common words. If we re-use an existing vocab file, this '
        'flag will have no effect. A value of < 0 means no max size.',
    )
    group.add_argument(
        '--train-source-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing source training examples. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--train-target-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing target training examples. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--eval-source-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing source eval examples for '
        'calculating validation loss and BLEU eval scores. '
        'This overrides what would be loaded from the data dir.',
    )
    group.add_argument(
        '--eval-target-text-file',
        default='',
        metavar='FILE',
        help='Path to raw text file containing target eval examples for '
        'calculating validation loss and BLEU eval scores. '
        'This overrides what would be loaded from the data dir.',
    )

    # Adds args related to checkpointing.
    group = parser.add_argument_group('Checkpointing')
    group.add_argument(
        '--no-end-of-epoch-checkpoints',
        action='store_true',
        help='Disables saving checkpoints at the end of the epoch. '
        'This differs from --no-save and --no-epoch-checkpoints in that it '
        'still allows for intra-epoch checkpoints if --save-interval is set.')

    # Adds args for generating intermediate BLEU eval while training.
    # generate.add_args() adds args used by both train.py and the standalone
    # generate binary, while the flags defined here are used only by train.py.
    generate.add_args(parser)
    group = parser.add_argument_group('Generation')
    group.add_argument(
        '--generate-bleu-eval-per-epoch',
        action='store_true',
        help='Whether to generate BLEU score eval after each epoch.',
    )
    group.add_argument(
        '--generate-bleu-eval-interval',
        default=0,
        type=int,
        metavar='N',
        help='Does BLEU eval every N batch updates. Note that '
        '--save-interval also affects this - we can only eval as '
        'frequently as a checkpoint is written. A value of <= 0 '
        'disables this.',
    )
    group.add_argument(
        '--generate-bleu-eval-avg-checkpoints',
        default=1,
        type=int,
        metavar='N',
        help='Maximum number of last N checkpoints to average over when '
        'doing BLEU eval. Must be >= 1.',
    )
    group.add_argument(
        '--continuous-averaging-after-epochs',
        type=int,
        default=-1,
        help=('Average parameter values after each step since previous '
              'checkpoint, beginning after the specified number of epochs.'),
    )
    return parser
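# Hedged sketch (an assumption, not the original trainer logic): how a counter
# behind --stop-no-best-validate-loss / --stop-no-best-bleu-eval above could
# be implemented. A negative patience disables the check.
class NoImprovementStopper:
    def __init__(self, patience):
        self.patience = patience
        self.best = float('inf')
        self.num_since_best = 0

    def should_stop(self, metric):
        # Lower is better (loss); negate the metric for BLEU.
        if metric < self.best:
            self.best = metric
            self.num_since_best = 0
        else:
            self.num_since_best += 1
        return self.patience >= 0 and self.num_since_best >= self.patience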
def get_parser_with_args():
    parser = options.get_parser("Trainer")
    options.add_dataset_args(parser, train=True, gen=True)
    options.add_distributed_training_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)
    options.add_generation_args(parser)

    parser.add_argument(
        "--log-verbose",
        action="store_true",
        help="Whether to output more verbose logs for debugging/profiling.",
    )

    # Adds args related to training (validation and stopping criterions).
    group = parser.add_argument_group("Optimization")
    group.add_argument(
        "--subepoch-validate-interval",
        default=0,
        type=int,
        metavar="N",
        help="Calculates loss over the validation set every N batch updates. "
        "Note that validation is done at the end of every epoch regardless. "
        "A value of <= 0 disables this.",
    )
    group.add_argument(
        "--stop-time-hr",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N hours have elapsed. "
        "A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-validate-loss",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N validations have been run without "
        "achieving a better loss than before. Note that this is affected by "
        "--validation-interval in how frequently we run validation in the "
        "first place. A value of < 0 disables this.",
    )
    group.add_argument(
        "--stop-no-best-bleu-eval",
        default=-1,
        type=int,
        metavar="N",
        help="Stops training after N evals have been run without "
        "achieving a better BLEU score than before. Note that this is affected "
        "by --generate-bleu-eval-interval in how frequently we run BLEU eval "
        "in the first place. A value of < 0 disables this.",
    )

    # Adds args related to input data files (preprocessing, numberizing, and
    # binarizing text files; creating vocab files)
    preprocess.add_args(parser)

    # Adds args related to checkpointing.
    group = parser.add_argument_group("Checkpointing")
    group.add_argument(
        "--no-end-of-epoch-checkpoints",
        action="store_true",
        help="Disables saving checkpoints at the end of the epoch. "
        "This differs from --no-save and --no-epoch-checkpoints in that it "
        "still allows for intra-epoch checkpoints if --save-interval is set.",
    )
    group.add_argument(
        "--max-checkpoints-kept",
        default=-1,
        type=int,
        metavar="N",
        help="Keep at most the last N checkpoint files around. "
        "A value < -1 keeps all. "
        "When --generate-bleu-eval-avg-checkpoints is used and is > N, the "
        "number of checkpoints kept around is automatically adjusted "
        "to allow BLEU to work properly.",
    )

    # Adds args for generating intermediate BLEU eval while training.
    # generate.add_args() adds args used by both train.py and the standalone
    # generate binary, while the flags defined here are used only by train.py.
    generate.add_args(parser)
    group = parser.add_argument_group("Generation")
    group.add_argument(
        "--generate-bleu-eval-per-epoch",
        action="store_true",
        help="Whether to generate BLEU score eval after each epoch.",
    )
    group.add_argument(
        "--generate-bleu-eval-interval",
        default=0,
        type=int,
        metavar="N",
        help="Does BLEU eval every N batch updates. Note that "
        "--save-interval also affects this - we can only eval as "
        "frequently as a checkpoint is written. A value of <= 0 "
        "disables this.",
    )
    group.add_argument(
        "--generate-bleu-eval-avg-checkpoints",
        default=1,
        type=int,
        metavar="N",
        help="Maximum number of last N checkpoints to average over when "
        "doing BLEU eval. Must be >= 1.",
    )
    group.add_argument(
        "--continuous-averaging-after-epochs",
        type=int,
        default=-1,
        help=(
            "Average parameter values after each step since previous "
            "checkpoint, beginning after the specified number of epochs."
        ),
    )
    return parser
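# Hedged sketch (an assumption): averaging parameters across the last N
# checkpoints, in the spirit of --generate-bleu-eval-avg-checkpoints and
# --continuous-averaging-after-epochs above.
import torch

def average_state_dicts(state_dicts):
    # state_dicts: list of model state dicts with identical keys.
    avg = {}
    for key in state_dicts[0]:
        stacked = torch.stack([sd[key].float() for sd in state_dicts])
        avg[key] = stacked.mean(dim=0)
    return avg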
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for validation (train, valid, valid1, test, test1)')
    dataset_args.add_argument(
        '--max-sentences-valid', type=int, metavar='N',
        help='maximum number of sentences in a validation batch')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'
    if args.max_sentences_valid is None:
        args.max_sentences_valid = args.max_sentences

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits,
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    args.num_gpus = torch.cuda.device_count()

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))
    print('| using {} GPUs (with max tokens per GPU = {} and '
          'max sentences per GPU = {})'.format(args.num_gpus, args.max_tokens,
                                               args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions()),
    )
    max_positions_valid = (model.max_encoder_positions(),
                           model.max_decoder_positions())

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Create files to save losses
    traincsv_path = os.path.join(args.save_dir, 'train_losses.csv')
    validcsv_path = os.path.join(args.save_dir, 'valid_losses.csv')
    output_path = [traincsv_path, validcsv_path]
    for path in output_path:
        with open(path, 'w+') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',')
            csvwriter.writerow(['Epoch', 'Perplexity', 'Loss'])

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset,
              max_positions_train, traincsv_path)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset,
                                max_positions_valid, subset, validcsv_path)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
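# Hedged sketch (an assumption, not shown in the original): how train() and
# validate() above might append one row per epoch to the CSV files created in
# main(), deriving perplexity as exp(loss).
import csv
import math

def append_loss_row(csv_path, epoch, loss):
    with open(csv_path, 'a') as f:
        csv.writer(f).writerow([epoch, math.exp(loss), loss])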
def get_parser_with_args():
    parser = options.get_parser("Generation")
    pytorch_translate_options.add_verbosity_args(parser)
    pytorch_translate_options.add_dataset_args(parser, gen=True)
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group)

    # Adds args used by the standalone generate binary.
    generation_group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help=(
            "Same as --source-vocab-file except using characters. "
            "(For use with char_source models only.)"
        ),
    )
    generation_group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--source-text-file",
        default="",
        nargs="+",
        metavar="FILE",
        help="Path to raw text file containing examples in source dialect. "
        "This overrides what would be loaded from the data dir. "
        "You can specify multiple source files (e.g. for use in combination "
        "with --source-ensembling). By default this will only translate the "
        "first source file.",
    )
    generation_group.add_argument(
        "--target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in target dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    generation_group.add_argument(
        "--source-binary-file",
        default="",
        help="Path for the binary file containing source eval examples. "
        "(Overrides --source-text-file. Must be used in conjunction with "
        "--target-binary-file).",
    )
    generation_group.add_argument(
        "--target-binary-file",
        default="",
        help="Path for the binary file containing target eval examples. "
        "(Overrides --target-text-file. Must be used in conjunction with "
        "--source-binary-file).",
    )
    generation_group.add_argument(
        "--translation-output-file",
        default="",
        type=str,
        metavar="FILE",
        help="Path to text file to store the output of the model.",
    )
    generation_group.add_argument(
        "--translation-probs-file",
        default="",
        type=str,
        metavar="FILE",
        help="Path to text file to store the probs of translation output.",
    )
    generation_group.add_argument(
        "--multiling-source-lang-id",
        type=int,
        default=None,
        help=(
            "Must be set for decoding with multilingual models. Set to i if "
            "the source language is the i-th language in the training parameter "
            "--multiling-encoder-lang (0-indexed)"
        ),
    )
    generation_group.add_argument(
        "--multiling-target-lang-id",
        type=int,
        default=None,
        help=(
            "Must be set for decoding with multilingual models. Set to i if "
            "the target language is the i-th language in the training parameter "
            "--multiling-decoder-lang (0-indexed)"
        ),
    )
    generation_group.add_argument(
        "--source-ensembling",
        action="store_true",
        help="If this flag is present, the model will ensemble the predictions "
        "conditioned on multiple source sentences (one per source-text-file)",
    )
    return parser
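# Hedged sketch (the helper name is an assumption): the help strings above say
# --source-binary-file and --target-binary-file must be used together, so a
# caller might enforce the pairing before generation starts.
def check_binary_file_args(args):
    if bool(args.source_binary_file) != bool(args.target_binary_file):
        raise ValueError(
            "--source-binary-file and --target-binary-file must be "
            "specified together."
        )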
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('-i', '--interactive', action='store_true',
                              help='generate translations in interactive mode')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
                              help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load model and dataset
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, dataset = utils.load_ensemble_for_inference(args.path, args.data)

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    if not args.interactive:
        print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                           len(dataset.splits[args.gen_subset])))

    # Optimize model for generation
    for model in models:
        model.make_generation_fast_(not args.no_beamable_mm)

    # Initialize generator
    translator = SequenceGenerator(models, dataset.dst_dict, beam_size=args.beam,
                                   stop_early=(not args.no_early_stop),
                                   normalize_scores=(not args.unnormalized),
                                   len_penalty=args.lenpen)

    align_dict = {}
    if args.unk_replace_dict != '':
        assert args.interactive, \
            'Unknown word replacement requires access to the original source ' \
            'and is only supported in interactive mode'
        with open(args.unk_replace_dict, 'r') as f:
            for line in f:
                l = line.split()
                align_dict[l[0]] = l[1]

    def replace_unk(hypo_str, align_str, src, unk):
        hypo_tokens = hypo_str.split()
        src_tokens = tokenizer.tokenize_line(src)
        align_idx = [int(i) for i in align_str.split()]
        for i, ht in enumerate(hypo_tokens):
            if ht == unk:
                src_token = src_tokens[align_idx[i]]
                if src_token in align_dict:
                    hypo_tokens[i] = align_dict[src_token]
                else:
                    hypo_tokens[i] = src_token
        return ' '.join(hypo_tokens)

    if use_cuda:
        translator.cuda()

    bpe_symbol = '@@ ' if args.remove_bpe else None

    def display_hypotheses(id, src, orig, ref, hypos):
        id_str = '' if id is None else '-{}'.format(id)
        src_str = to_sentence(dataset.src_dict, src, bpe_symbol)
        print('S{}\t{}'.format(id_str, src_str))
        if orig is not None:
            print('O{}\t{}'.format(id_str, orig.strip()))
        if ref is not None:
            print('T{}\t{}'.format(id_str, to_sentence(dataset.dst_dict, ref,
                                                       bpe_symbol, ref_unk=True)))
        for hypo in hypos:
            hypo_str = to_sentence(dataset.dst_dict, hypo['tokens'], bpe_symbol)
            align_str = ' '.join(map(str, hypo['alignment']))
            if args.unk_replace_dict != '':
                hypo_str = replace_unk(hypo_str, align_str, orig,
                                       unk_symbol(dataset.dst_dict))
            print('H{}\t{}\t{}'.format(id_str, hypo['score'], hypo_str))
            print('A{}\t{}'.format(id_str, align_str))

    if args.interactive:
        for line in sys.stdin:
            tokens = tokenizer.Tokenizer.tokenize(line, dataset.src_dict,
                                                  add_if_not_exist=False).long()
            start = dataset.src_dict.pad() + 1
            positions = torch.arange(start, start + len(tokens)).type_as(tokens)
            if use_cuda:
                positions = positions.cuda()
                tokens = tokens.cuda()
            translations = translator.generate(Variable(tokens.view(1, -1)),
                                               Variable(positions.view(1, -1)))
            hypos = translations[0]
            display_hypotheses(None, tokens, line, None,
                               hypos[:min(len(hypos), args.nbest)])
    else:
        def maybe_remove_bpe(tokens):
            """Helper for removing BPE symbols from a hypothesis."""
            if not args.remove_bpe:
                return tokens
            assert (tokens == dataset.dst_dict.pad()).sum() == 0
            hypo_minus_bpe = to_sentence(dataset.dst_dict, tokens, bpe_symbol)
            return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict,
                                                add_if_not_exist=True)

        # Generate and compute BLEU score
        scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
                             dataset.dst_dict.unk())
        itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size,
                                 max_positions=args.max_positions)
        num_sentences = 0
        with progress_bar(itr, smoothing=0, leave=False) as t:
            wps_meter = TimeMeter()
            gen_timer = StopwatchMeter()
            translations = translator.generate_batched_itr(
                t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
                cuda_device=0 if use_cuda else None, timer=gen_timer)
            for id, src, ref, hypos in translations:
                ref = ref.int().cpu()
                top_hypo = hypos[0]['tokens'].int().cpu()
                scorer.add(maybe_remove_bpe(ref), maybe_remove_bpe(top_hypo))
                display_hypotheses(id, src, None, ref,
                                   hypos[:min(len(hypos), args.nbest)])

                wps_meter.update(src.size(0))
                t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg)))
                num_sentences += 1

        print('| Translated {} sentences ({} tokens) in {:.1f}s '
              '({:.2f} tokens/s)'.format(num_sentences, gen_timer.n,
                                         gen_timer.sum, 1. / gen_timer.avg))
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))
    valid_align_path = (args.valid_pre + ".{}-{}.".format(src, tgt)
                        + args.align_suffix)
    make_binary_alignment_dataset(valid_align_path, "valid.align", src, tgt,
                                  num_workers=args.workers)
    if args.test_pre:
        test_align_path = (args.test_pre + ".{}-{}.".format(src, tgt)
                           + args.align_suffix)
        make_binary_alignment_dataset(test_align_path, "test.align", src, tgt,
                                      num_workers=args.workers)

    for src in args.source_langs:
        for tgt in args.target_langs:
            make_all(src, tgt)
            if args.align_suffix:
                make_all_alignments(src, tgt)

    print("| Wrote preprocessed data to {}".format(args.dest_dir))


if __name__ == "__main__":
    parser = options.get_parser('Preprocessing', default_task='translation')
    add_preprocess_args(parser)
    args = parser.parse_args()
    main(args)
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for validation (train, valid, valid1, test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits,
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {} and '
          'max sentences per GPU = {})'.format(num_gpus, args.max_tokens,
                                               args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions, args.max_target_positions)
    max_positions_valid = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions()),
    )

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset,
              max_positions_train, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset,
                                max_positions_valid, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
def main():
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=0, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--test-batch-size', default=32, type=int,
                              metavar='N', help='batch size for test set')
    dataset_args.add_argument('--valid-batch-size', default=32, type=int,
                              metavar='N', help='batch size for validation set')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
             'to use for validation (train, valid, valid1, test, test1)')
    dataset_args.add_argument('--test-subset', default='test', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   'to use for testing (train, valid, test)')
    dataset_args.add_argument(
        '--valid-script', nargs='+', metavar='PATH',
        help='path to external validation script (optional).')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Setting args.max_tokens to infinity (same as setting it to None)
    if args.max_tokens == 0:
        args.max_tokens = None

    # Load dataset
    dataset = data.load_with_check(args.data, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in dataset.splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model
    print('| model {}'.format(args.arch))
    model = utils.build_model(args, dataset)
    criterion = utils.build_criterion(args, dataset)

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model)

    # Load the latest checkpoint if one is available
    epoch, batch_offset = trainer.load_checkpoint(
        os.path.join(args.save_dir, args.restore_file))

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, criterion, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, criterion, dataset,
                                subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    trainer.save_checkpoint(args, epoch, 0, val_loss,
                                            validation_script=args.valid_script)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Generate on test set and compute BLEU score
    for beam in [1, 5, 10, 20]:
        for subset in args.test_subset.split(','):
            scorer = score_test(args, trainer.get_model(), dataset, subset, beam,
                                cuda_device=(0 if num_gpus > 0 else None))
            print('| Test on {} with beam={}: {}'.format(
                subset, beam, scorer.result_string()))

    # Stop multiprocessing
    trainer.stop()
def get_lm_scorer_parser(default_task='language_modeling'):
    parser = options.get_parser('Evaluate Language Model', default_task)
    options.add_dataset_args(parser, gen=True)
    options.add_common_eval_args(parser)
    add_lm_scorer_args(parser)
    return parser
def get_parser_with_args():
    parser = options.get_parser("Generation", default_task="pytorch_translate")
    pytorch_translate_options.add_verbosity_args(parser)
    pytorch_translate_options.add_dataset_args(parser, gen=True)
    generation_group = options.add_generation_args(parser)
    pytorch_translate_options.expand_generation_args(generation_group)

    # Adds args used by the standalone generate binary.
    generation_group.add_argument(
        "--source-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--char-source-vocab-file",
        default="",
        metavar="FILE",
        help=(
            "Same as --source-vocab-file except using characters. "
            "(For use with char_source models only.)"
        ),
    )
    generation_group.add_argument(
        "--target-vocab-file",
        default="",
        metavar="FILE",
        help="Path to text file representing the Dictionary to use.",
    )
    generation_group.add_argument(
        "--source-text-file",
        default="",
        nargs="+",
        metavar="FILE",
        help="Path to raw text file containing examples in source dialect. "
        "This overrides what would be loaded from the data dir. "
        "You can specify multiple source files (e.g. for use in combination "
        "with --source-ensembling). By default this will only translate the "
        "first source file.",
    )
    generation_group.add_argument(
        "--target-text-file",
        default="",
        metavar="FILE",
        help="Path to raw text file containing examples in target dialect. "
        "This overrides what would be loaded from the data dir.",
    )
    generation_group.add_argument(
        "--source-binary-file",
        default="",
        help="Path for the binary file containing source eval examples. "
        "(Overrides --source-text-file. Must be used in conjunction with "
        "--target-binary-file).",
    )
    generation_group.add_argument(
        "--target-binary-file",
        default="",
        help="Path for the binary file containing target eval examples. "
        "(Overrides --target-text-file. Must be used in conjunction with "
        "--source-binary-file).",
    )
    generation_group.add_argument(
        "--translation-output-file",
        default="",
        type=str,
        metavar="FILE",
        help="Path to text file to store the output of the model.",
    )
    generation_group.add_argument(
        "--translation-probs-file",
        default="",
        type=str,
        metavar="FILE",
        help="Path to text file to store the probs of translation output.",
    )
    generation_group.add_argument(
        "--multiling-source-lang",
        action="append",
        metavar="SRC",
        help=(
            "Must be set for decoding with multilingual models. "
            "Must match an entry from --multiling-encoder-lang from training."
        ),
    )
    generation_group.add_argument(
        "--multiling-target-lang",
        action="append",
        metavar="TARGET",
        help=(
            "Must be set for decoding with multilingual models. "
            "Must match an entry from --multiling-decoder-lang from training."
        ),
    )
    generation_group.add_argument(
        "--source-ensembling",
        action="store_true",
        help="If this flag is present, the model will ensemble the predictions "
        "conditioned on multiple source sentences (one per source-text-file)",
    )
    generation_group.add_argument(
        "--competing-completed-beam-search",
        action="store_true",
        help="If this flag is present, use the alternative beam search "
        "implementation in research/beam_search. This beam search keeps completed "
        "hypos in the beam and lets them compete against hypo expansions in the "
        "next time step.",
    )
    return parser
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
                              help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'none'
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset
    if args.replace_unk is None:
        dataset = data.load_dataset(args.data, [args.gen_subset],
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, [args.gen_subset],
                                             args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, _ = utils.load_ensemble_for_inference(args.path, dataset.src_dict,
                                                  dataset.dst_dict)

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(dataset.splits[args.gen_subset])))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
                         dataset.dst_dict.unk())
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        args.gen_subset, max_sentences=args.batch_size,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
    num_sentences = 0
    with utils.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
            cuda_device=0 if use_cuda else None, timer=gen_timer)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[args.gen_subset].src.get_original_text(sample_id)
                target_str = dataset.splits[args.gen_subset].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe)

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    print('A-{}\t{}'.format(sample_id, ' '.join(map(str, alignment))))

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk
                        # replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, dataset.dst_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                  scorer.result_string()))
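# Illustrative invocation of the generate script above; the data directory and
# checkpoint path are placeholders. The flags shown (--path, --gen-subset,
# --batch-size, --beam, --remove-bpe) are registered either directly in main()
# or by options.add_generation_args.
#
#   python generate.py data-bin/wmt14.en-fr \
#       --path checkpoints/checkpoint_best.pt \
#       --gen-subset test --batch-size 32 --beam 5 --remove-bpe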