def main():
    """Command-line entry point for multi-GPU training.

    Parses the trainer options, loads the 'train' and 'valid' splits, builds
    the model and criterion, restores the latest checkpoint when one is
    returned by the trainer, and then alternates training and validation
    epochs until the learning rate drops to ``args.min_lr`` or the epoch
    counter passes ``args.max_epoch``.

    Raises:
        NotImplementedError: if CUDA is not available.
    """
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list ofdata subsets '
             ' to use for validation (train, valid, valid1,test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.save_dir, exist_ok=True)
    torch.manual_seed(args.seed)

    # Load dataset
    # NOTE(review): only 'train' and 'valid' are requested here, while
    # --valid-subset accepts other names -- confirm those are loaded elsewhere.
    dataset = data.load_with_check(args.data, ['train', 'valid'],
                                   args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in ['train', 'valid']:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            # a zero offset means the stored epoch is not resumed mid-way,
            # so continue with the following epoch
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    # a falsy --max-epoch (0 / None) means "no epoch limit"
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        # only the first epoch after a restore may start mid-way
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
def main():
    """Command-line entry point for training followed by test-set scoring.

    Parses the trainer options, loads the dataset, builds the model and
    criterion, resumes from the checkpoint position reported by the trainer,
    and alternates training and validation epochs until the learning rate
    drops to ``args.min_lr`` or the epoch counter passes ``args.max_epoch``.
    Afterwards every subset in ``--test-subset`` is scored with beam sizes
    1, 5, 10 and 20 and the result strings are printed.

    Raises:
        NotImplementedError: if CUDA is not available.
    """
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=0, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--test-batch-size', default=32, type=int, metavar='N',
                              help='batch size for test set')
    dataset_args.add_argument('--valid-batch-size', default=32, type=int, metavar='N',
                              help='batch size for validation set')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list ofdata subsets '
             ' to use for validation (train, valid, valid1,test, test1)')
    dataset_args.add_argument('--test-subset', default='test', metavar='SPLIT',
                              help='comma separated list ofdata subset '
                                   'to use for testing (train, valid, test)')
    dataset_args.add_argument(
        '--valid-script', nargs='+', metavar='PATH',
        help='path to external validation script (optional).')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
        progress_bar.print_interval = args.log_interval

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.save_dir, exist_ok=True)
    torch.manual_seed(args.seed)

    # Setting args.max_tokens to infinity(same as setting to None)
    if args.max_tokens == 0:
        args.max_tokens = None

    # Load dataset
    dataset = data.load_with_check(args.data, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in dataset.splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {})'.format(
        num_gpus, args.max_tokens))

    # Build model
    print('| model {}'.format(args.arch))
    model = utils.build_model(args, dataset)
    criterion = utils.build_criterion(args, dataset)

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model)

    # Load the latest checkpoint if one is available
    epoch, batch_offset = trainer.load_checkpoint(
        os.path.join(args.save_dir, args.restore_file))

    # Train until the learning rate gets too small
    val_loss = None
    # a falsy --max-epoch (0 / None) means "no epoch limit"
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, criterion, dataset, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, criterion, dataset, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    trainer.save_checkpoint(
                        args, epoch, 0, val_loss,
                        validation_script=args.valid_script)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        # only the first epoch after a restore may start mid-way
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Generate on test set and compute BLEU score
    # The subset list does not depend on the beam size, so split it once.
    test_subsets = args.test_subset.split(',')
    for beam in [1, 5, 10, 20]:
        for subset in test_subsets:
            scorer = score_test(args, trainer.get_model(), dataset, subset, beam,
                                cuda_device=(0 if num_gpus > 0 else None))
            print('| Test on {} with beam={}: {}'.format(
                subset, beam, scorer.result_string()))

    # Stop multiprocessing
    trainer.stop()
def main():
    """Command-line entry point for translation / generation.

    Loads one or more models (``--path`` may be repeated) as an ensemble and
    either translates lines read from stdin (``--interactive``) or translates
    the ``--gen-subset`` split in batches, printing the hypotheses and the
    final scorer result string.

    Output lines are tab separated and prefixed with ``S`` (source), ``O``
    (original input line), ``T`` (reference), ``H`` (hypothesis with score)
    and ``A`` (alignment); nothing per-sentence is printed when
    ``args.quiet`` is set.
    """
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('-i', '--interactive', action='store_true',
                              help='generate translations in interactive mode')
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument(
        '--gen-subset', default='test', metavar='SPLIT',
        help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    if args.no_progress_bar:
        progress_bar.enabled = False
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset
    dataset = data.load_with_check(args.data, [args.gen_subset],
                                   args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models = utils.load_ensemble_for_inference(args.path, dataset.src_dict, dataset.dst_dict)

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    if not args.interactive:
        print('| {} {} {} examples'.format(
            args.data, args.gen_subset, len(dataset.splits[args.gen_subset])))

    # Max positions is the model property but it is needed in data reader to be able to
    # ignore too long sentences
    args.max_positions = min(args.max_positions,
                             *(m.decoder.max_positions() for m in models))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(not args.no_beamable_mm)

    # Initialize generator
    translator = SequenceGenerator(models, beam_size=args.beam,
                                   stop_early=(not args.no_early_stop),
                                   normalize_scores=(not args.unnormalized),
                                   len_penalty=args.lenpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    align_dict = {}
    if args.unk_replace_dict != '':
        assert args.interactive, \
            'Unknown word replacement requires access to original source and is only supported in interactive mode'
        with open(args.unk_replace_dict, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # tolerate blank lines (e.g. a trailing newline at the
                    # end of the file) instead of failing with IndexError
                    continue
                align_dict[fields[0]] = fields[1]

    def replace_unk(hypo_str, align_str, src, unk):
        """Replace each `unk` token in `hypo_str` using the aligned source token.

        The aligned source token is mapped through `align_dict` when it has
        an entry there; otherwise the source token itself is copied.
        """
        hypo_tokens = hypo_str.split()
        src_tokens = tokenizer.tokenize_line(src)
        align_idx = [int(i) for i in align_str.split()]
        for i, ht in enumerate(hypo_tokens):
            if ht == unk:
                src_token = src_tokens[align_idx[i]]
                hypo_tokens[i] = align_dict.get(src_token, src_token)
        return ' '.join(hypo_tokens)

    def display_hypotheses(sample_id, src, orig, ref, hypos):
        """Print the source, optional original/reference, and each hypothesis."""
        if args.quiet:
            return
        id_str = '' if sample_id is None else '-{}'.format(sample_id)
        src_str = dataset.src_dict.string(src, args.remove_bpe)
        print('S{}\t{}'.format(id_str, src_str))
        if orig is not None:
            print('O{}\t{}'.format(id_str, orig.strip()))
        if ref is not None:
            print('T{}\t{}'.format(
                id_str, dataset.dst_dict.string(ref, args.remove_bpe, escape_unk=True)))
        for hypo in hypos:
            hypo_str = dataset.dst_dict.string(hypo['tokens'], args.remove_bpe)
            align_str = ' '.join(map(str, hypo['alignment']))
            if args.unk_replace_dict != '':
                hypo_str = replace_unk(hypo_str, align_str, orig,
                                       dataset.dst_dict.unk_string())
            print('H{}\t{}\t{}'.format(id_str, hypo['score'], hypo_str))
            print('A{}\t{}'.format(id_str, align_str))

    if args.interactive:
        for line in sys.stdin:
            tokens = tokenizer.Tokenizer.tokenize(
                line, dataset.src_dict, add_if_not_exist=False).long()
            # position indices start right after the padding index
            start = dataset.src_dict.pad() + 1
            positions = torch.arange(start, start + len(tokens)).type_as(tokens)
            if use_cuda:
                positions = positions.cuda()
                tokens = tokens.cuda()
            # a single sentence is fed as a batch of size one
            translations = translator.generate(Variable(tokens.view(1, -1)),
                                               Variable(positions.view(1, -1)))
            hypos = translations[0]
            # slicing already clamps to len(hypos)
            display_hypotheses(None, tokens, line, None, hypos[:args.nbest])

    else:
        def maybe_remove_bpe(tokens, escape_unk=False):
            """Helper for removing BPE symbols from a hypothesis."""
            if args.remove_bpe is None:
                return tokens
            assert (tokens == dataset.dst_dict.pad()).sum() == 0
            hypo_minus_bpe = dataset.dst_dict.string(tokens, args.remove_bpe, escape_unk)
            return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict,
                                                add_if_not_exist=True)

        # Generate and compute BLEU score
        scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
                             dataset.dst_dict.unk())
        itr = dataset.dataloader(
            args.gen_subset, batch_size=args.batch_size,
            max_positions=args.max_positions,
            skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
        num_sentences = 0
        with progress_bar(itr, smoothing=0, leave=False) as t:
            wps_meter = TimeMeter()
            gen_timer = StopwatchMeter()
            translations = translator.generate_batched_itr(
                t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
                cuda_device=0 if use_cuda else None, timer=gen_timer)
            for sample_id, src, ref, hypos in translations:
                ref = ref.int().cpu()
                top_hypo = hypos[0]['tokens'].int().cpu()
                # only the first hypothesis is scored against the reference
                scorer.add(maybe_remove_bpe(ref, escape_unk=True),
                           maybe_remove_bpe(top_hypo))
                display_hypotheses(sample_id, src, None, ref, hypos[:args.nbest])

                wps_meter.update(src.size(0))
                t.set_postfix(wps='{:5d}'.format(round(wps_meter.avg)), refresh=False)
                num_sentences += 1

        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                      scorer.result_string()))