def save_checkpoint(args, trainer, epoch_itr, val_loss):
    """Write every checkpoint variant that is due at this point in training.

    Only the distributed master process writes (and ``--no-save`` disables
    saving entirely).  The best validation loss observed so far is tracked on
    the function attribute ``save_checkpoint.best``.
    """
    # Non-master workers and --no-save runs do nothing.
    if args.no_save or not distributed_utils.is_master(args):
        return

    timer = StopwatchMeter()
    timer.start()

    epoch_num = epoch_itr.epoch
    at_epoch_end = epoch_itr.end_of_epoch()
    num_updates = trainer.get_num_updates()

    # Map filename -> whether it is due this call.  The first due entry is
    # written for real and the rest become copies of it.
    conds = collections.OrderedDict()
    conds['checkpoint{}.pt'.format(epoch_num)] = (
        at_epoch_end
        and not args.no_epoch_checkpoints
        and epoch_num % args.save_interval == 0
    )
    conds['checkpoint_{}_{}.pt'.format(epoch_num, num_updates)] = (
        not at_epoch_end
        and args.save_interval_updates > 0
        and num_updates % args.save_interval_updates == 0
    )
    conds['checkpoint_best.pt'] = val_loss is not None and (
        not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best
    )
    conds[
        'checkpoint_last.pt'] = True  # keep this last so that it's a symlink

    # Update the running best only AFTER the conditions above were evaluated,
    # so 'checkpoint_best.pt' is compared against the previous best.
    prev_best = getattr(save_checkpoint, 'best', val_loss)
    if val_loss is not None:
        save_checkpoint.best = min(val_loss, prev_best)

    extra_state = {
        'train_iterator': epoch_itr.state_dict(),
        'val_loss': val_loss,
    }
    if hasattr(save_checkpoint, 'best'):
        extra_state.update({'best': save_checkpoint.best})

    due = [
        os.path.join(args.save_dir, name)
        for name, flag in conds.items() if flag
    ]
    if len(due) > 0:
        # Serialize once, then clone the file for the remaining names.
        trainer.save_checkpoint(due[0], extra_state)
        for duplicate in due[1:]:
            shutil.copyfile(due[0], duplicate)
        timer.stop()
        print(
            '| saved checkpoint {} (epoch {} @ {} updates) (writing took {} seconds)'
            .format(due[0], epoch_num, num_updates, timer.sum))

    if not at_epoch_end and args.keep_interval_updates > 0:
        # remove old checkpoints; checkpoints are sorted in descending order
        stale = checkpoint_paths(
            args.save_dir,
            pattern=r'checkpoint_\d+_(\d+)\.pt',
        )
        for old_chk in stale[args.keep_interval_updates:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if args.keep_last_epochs > 0:
        # remove old epoch checkpoints; checkpoints are sorted in descending order
        stale = checkpoint_paths(
            args.save_dir,
            pattern=r'checkpoint(\d+)\.pt',
        )
        for old_chk in stale[args.keep_last_epochs:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
def main(parsed_args):
    """Score a dataset with an ensemble of language models and report
    loss/perplexity (plus optional per-word probabilities/statistics and
    per-sample output distributions).
    """
    assert parsed_args.path is not None, '--path required for evaluation!'
    utils.import_user_module(parsed_args)
    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu
    task = tasks.setup_task(parsed_args)

    # Load ensemble
    # NOTE(review): eval() executes the --model-overrides string as Python;
    # only trusted values should be passed here.
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = checkpoint_utils.load_model_ensemble(
        parsed_args.path.split(':'),
        arg_overrides=eval(parsed_args.model_overrides),
        task=task,
    )

    # Copy CLI args over the checkpoint args, except for the data-shape
    # settings that must come from the trained model itself.
    for arg in vars(parsed_args).keys():
        if arg not in {
                'self_target', 'future_target', 'past_target',
                'tokens_per_sample', 'output_size_dictionary', 'add_bos_token',
        }:
            setattr(args, arg, getattr(parsed_args, arg))

    # reduce tokens per sample by the required context window size
    if args.task != "translation":
        args.tokens_per_sample -= args.context_window
    # Re-create the task now that args carry the merged settings.
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    dataset = task.dataset(args.gen_subset)
    if args.context_window > 0:
        # Wrap the dataset so each sample carries extra left-context tokens.
        dataset = LMContextWindowDataset(
            dataset=dataset,
            tokens_per_sample=args.tokens_per_sample,
            context_window=args.context_window,
            pad_idx=task.source_dictionary.pad(),
        )
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset)))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()
    assert len(models) > 0

    print('num. model params: {}'.format(
        sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 36000,  # default token budget per batch
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(*[
            model.max_positions() for model in models
        ]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary, args.softmax_batch)

    # Running totals for the average negative log-likelihood.
    score_sum = 0.
    count = 0

    # Precompute the set of dictionary indices that are BPE continuation
    # tokens, so their scores can be folded into the following token.
    if args.remove_bpe is not None:
        if args.remove_bpe == 'sentencepiece':
            raise NotImplementedError
        else:
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = set(
                i for i in range(len(task.source_dictionary))
                if task.source_dictionary[i].endswith(bpe_cont)
            )
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            if 'net_input' not in sample:
                continue

            sample = utils.move_to_cuda(sample) if use_cuda else sample

            gen_timer.start()
            hypos = scorer.generate(models, sample)
            gen_timer.stop(sample['ntokens'])

            for i, hypos_i in enumerate(hypos):
                hypo = hypos_i[0]  # only the top hypothesis is scored
                sample_id = sample['id'][i]

                tokens = hypo['tokens']
                tgt_len = tokens.numel()
                pos_scores = hypo['positional_scores'].float()
                # NOTE(review): assumes the scorer always attaches
                # 'output_distributions' to each hypothesis — confirm.
                output_dists = hypo['output_distributions'].float()

                if args.task != "translation" and args.add_bos_token:
                    # Drop the leading BOS token from scoring.
                    assert hypo['tokens'][0].item() == task.target_dictionary.bos()
                    tokens = tokens[1:]
                    pos_scores = pos_scores[1:]

                skipped_toks = 0
                if bpe_toks is not None:
                    # Fold each BPE continuation token's score into the next
                    # token so scores are per-word rather than per-subword.
                    # NOTE(review): this inner `i` shadows the enumerate index
                    # above; harmless here since sample_id was taken first.
                    for i in range(tgt_len - 1):
                        if tokens[i].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                # Drop positions that scored +/-inf so they do not poison the
                # aggregate loss.
                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(tokens[inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum().cpu()
                count += pos_scores.numel() - skipped_toks

                # Optionally dump the full per-position output distribution of
                # this sample as a .npy file.
                if args.output_distributions_folder is not None:
                    if not os.path.exists(args.output_distributions_folder):
                        os.mkdir(args.output_distributions_folder)
                    od_filename = f"output-dist-{sample_id}.npy"
                    np.save(
                        os.path.join(args.output_distributions_folder, od_filename),
                        output_dists.cpu().numpy())

                if args.output_word_probs or args.output_word_stats:
                    # Reassemble words from subword tokens and record each
                    # word's (summed) log-probability.
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(tokens)):
                        w_ind = tokens[i].item()
                        w += task.source_dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]  # strip the continuation marker
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))

                            # Look ahead for the next non-zero score (zeros were
                            # folded-away BPE positions).
                            next_prob = None
                            ind = i + 1
                            while ind < len(tokens):
                                if pos_scores[ind].item() != 0:
                                    next_prob = pos_scores[ind]
                                    break
                                ind += 1

                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                                pos_scores[i].item(), next_prob)
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        print(
                            str(int(sample_id)) + " "
                            + ('\t'.join('{} [{:2f}]'.format(x[0], x[1])
                                         for x in word_prob))
                        )

            wps_meter.update(sample['ntokens'])
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(
        gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(
        avg_nll_loss, np.exp(avg_nll_loss)))
    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            print(ws)
def main(args):
    """Interactive translation: read sentences from stdin (optionally in
    buffered batches), translate them with an ensemble of models, and print
    hypotheses, positional scores and (optionally) alignments.
    """
    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    # NOTE(review): eval() executes the --model-overrides string as Python;
    # only trusted values should be passed here.
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides=eval(args.model_overrides))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict, beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen, unk_penalty=args.unkpen,
        sampling=args.sampling, sampling_topk=args.sampling_topk,
        minlen=args.min_len, sampling_temperature=args.sampling_temperature)

    if use_cuda:
        translator.cuda()

    # Load BPE codes file.
    # BUGFIX: `bpe` was previously only bound when --bpe-codes was given,
    # which raised a NameError in make_batches() below for runs without BPE
    # codes.  Default it to None so the no-BPE path works.
    bpe = None
    if args.bpe_codes:
        # NOTE(review): this file handle is never closed; if BPE() reads the
        # codes eagerly (as subword-nmt does), wrap in `with` and close it.
        codes = open(args.bpe_codes, 'r')
        bpe = BPE(codes)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        """Format one source sentence's top-n hypotheses for printing."""
        result = Translation(
            src_str='O\t{}'.format(src_str),
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu()
                if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            # NOTE(review): target language is hard-coded to 'de' here —
            # confirm this is intentional for this deployment.
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
            result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
            result.pos_scores.append('P\t{}'.format(' '.join(
                map(
                    lambda x: '{:.4f}'.format(x),
                    hypo['positional_scores'].tolist(),
                ))))
            result.alignments.append('A\t{}'.format(' '.join(
                map(lambda x: str(utils.item(x)), alignment)))
                if args.print_alignment else None)
        return result

    gen_timer = StopwatchMeter()
    end2end_timer = StopwatchMeter()

    def process_batch(batch):
        """Translate one batch and return formatted results per sentence."""
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop()

        return [
            make_result(batch.srcs[i], t) for i, t in enumerate(translations)
        ]

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        end2end_timer.start()
        for batch, batch_indices in make_batches(
                inputs, args, src_dict, models[0].max_positions(), bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        # Print results in the original input order.
        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores,
                                               result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)
        print('Model latency: {} s'.format(gen_timer.sum))
        gen_timer.reset()
        end2end_timer.stop()
        print('End-to-end translation time: {} s'.format(end2end_timer.sum))
        end2end_timer.reset()
def main(args, init_distributed=False):
    """Train a model: set up the task/model/criterion/trainer, restore the
    latest checkpoint, and run the epoch loop until the learning rate, epoch
    count, or update count reaches its limit.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.train_subset, combine=True, epoch=0)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=True, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Tightest position limit shared by the task and the model.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr, max_positions, task)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # Rebuild the training iterator for the next epoch (e.g. to re-shard
        # or reload sharded data).
        epoch_itr = reload_train(args, epoch_itr, max_positions, task)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def _main(args, output_file):
    """Generate translations for a dataset, score them with BLEU, and
    optionally build (or query) a kNN-LM datastore of decoder states.

    Returns the BLEU scorer so callers can inspect the final result.
    """
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    # NOTE(review): eval() executes the --model-overrides string as Python;
    # only trusted values should be passed here.
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Pull the datastore-relevant dimensions from the trained model args.
    args.vocab_size = len(tgt_dict)
    for arg in vars(_model_args).keys():
        if arg in {'decoder_embed_dim', 'vocab_size'}:
            setattr(args, arg, getattr(_model_args, arg))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Building and querying the datastore are mutually exclusive modes.
    if args.knnlm and args.save_knnlm_dstore:
        raise ValueError(
            "Cannot use knnlm while trying to build the datastore!")

    if args.knnlm:
        knn_dstore = KNN_Dstore(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    if args.save_knnlm_dstore:
        # Pre-allocate the key/value memmaps for the full datastore size.
        print('keytype being saved:', args.knn_keytype)
        if args.dstore_fp16:
            print('Saving fp16')
            # NOTE(review): int16 values overflow for vocabularies larger
            # than 32767 token ids — confirm the vocab fits, or use int32.
            dstore_keys = np.memmap(args.dstore_mmap + '_keys.npy',
                                    dtype=np.float16,
                                    mode='w+',
                                    shape=(args.dstore_size,
                                           args.decoder_embed_dim))
            dstore_vals = np.memmap(args.dstore_mmap + '_vals.npy',
                                    dtype=np.int16,
                                    mode='w+',
                                    shape=(args.dstore_size, 1))
        else:
            print('Saving fp32')
            dstore_keys = np.memmap(args.dstore_mmap + '_keys.npy',
                                    dtype=np.float32,
                                    mode='w+',
                                    shape=(args.dstore_size,
                                           args.decoder_embed_dim))
            # NOTE(review): np.int is deprecated (removed in NumPy 1.24+);
            # confirm the pinned NumPy version or switch to np.int64.
            dstore_vals = np.memmap(args.dstore_mmap + '_vals.npy',
                                    dtype=np.int,
                                    mode='w+',
                                    shape=(args.dstore_size, 1))
        dstore_idx = 0  # next free row in the memmaps

    if args.save_knnlm_dstore or args.knnlm:
        # source_tokens_file = open(args.output_tokens_file_prefix + '.src' , 'w')
        # NOTE(review): opened 'w' but read back via readlines() at the end —
        # that read will raise io.UnsupportedOperation; 'w+' seems intended.
        target_tokens_file = open(args.output_tokens_file_prefix + '.tgt', 'w')
        # This is only for MT right now, use interactive.py for language modeling
        # NOTE(review): this compares a task OBJECT to a string, so it is
        # always true — probably meant args.task != 'language_modeling'.
        assert task != 'language_modeling'

    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            # Optionally force-decode the first prefix_size target tokens.
            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            if args.knnlm:
                hypos = task.inference_step(generator, models, sample,
                                            prefix_tokens,
                                            knn_dstore=knn_dstore)
            else:
                hypos = task.inference_step(generator, models, sample,
                                            prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            if args.save_knnlm_dstore:
                # Append each top hypothesis' decoder states (keys) and tokens
                # (values) to the memmaps, truncating at dstore_size.
                for i, hypos_i in enumerate(hypos):
                    hypo = hypos_i[0]
                    shape = hypo['dstore_keys'].shape
                    if dstore_idx + shape[0] > args.dstore_size:
                        shape = [args.dstore_size - dstore_idx]
                        hypo['dstore_keys'] = hypo['dstore_keys'][:shape[0]]
                    # import pdb; pdb.set_trace()
                    # print(hypo)
                    if args.dstore_fp16:
                        dstore_keys[dstore_idx:shape[0] +
                                    dstore_idx] = hypo['dstore_keys'].view(
                                        -1, args.decoder_embed_dim).cpu(
                                        ).numpy().astype(np.float16)
                        dstore_vals[dstore_idx:shape[0] +
                                    dstore_idx] = hypo['tokens'].view(
                                        -1, 1).cpu().numpy().astype(np.int16)
                    else:
                        dstore_keys[dstore_idx:shape[0] +
                                    dstore_idx] = hypo['dstore_keys'].view(
                                        -1, args.decoder_embed_dim).cpu(
                                        ).numpy().astype(np.float32)
                        dstore_vals[dstore_idx:shape[0] +
                                    dstore_idx] = hypo['tokens'].view(
                                        -1, 1).cpu().numpy().astype(np.int)
                    dstore_idx += shape[0]

            if args.save_knnlm_dstore or args.knnlm:
                for i, hypos_i in enumerate(hypos):
                    hypo = hypos_i[0]
                    # dump the tokens to a file, used for analysis and interactive printing
                    # source_tokens = [task.source_dictionary[token] for token in hypo['source_tokens']]
                    # source_tokens_file.write('\n'.join(source_tokens) + '\n')
                    target_tokens = [
                        task.target_dictionary[token]
                        for token in hypo['tokens']
                    ]
                    target_tokens_file.write('\n'.join(target_tokens) + '\n')

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str),
                              file=output_file)
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str),
                              file=output_file)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        score = hypo['score'] / math.log(
                            2)  # convert to base 2
                        print('H-{}\t{}\t{}'.format(sample_id, score,
                                                    hypo_str),
                              file=output_file)
                        print(
                            'P-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(
                                        lambda x: '{:.4f}'.format(x),
                                        # convert from base e to base 2
                                        hypo['positional_scores'].div_(
                                            math.log(2)).tolist(),
                                    ))),
                            file=output_file)

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join([
                                    '{}-{}'.format(src_idx, tgt_idx)
                                    for src_idx, tgt_idx in alignment
                                ])),
                                  file=output_file)

                        if args.print_step:
                            print('I-{}\t{}'.format(sample_id, hypo['steps']),
                                  file=output_file)

                        if getattr(args, 'retain_iter_history', False):
                            # Print every intermediate hypothesis of an
                            # iterative-refinement decoder.
                            for step, h in enumerate(hypo['history']):
                                _, h_str, _ = utils.post_process_prediction(
                                    hypo_tokens=h['tokens'].int().cpu(),
                                    src_str=src_str,
                                    alignment=None,
                                    align_dict=None,
                                    tgt_dict=tgt_dict,
                                    remove_bpe=None,
                                )
                                print('E-{}_{}\t{}'.format(
                                    sample_id, step, h_str),
                                      file=output_file)

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        logger.info('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    if args.save_knnlm_dstore:
        print("dstore_idx", dstore_idx, "final shape", shape)
        print("Keys", dstore_keys.shape, dstore_keys.dtype)
        print("Vals", dstore_vals.shape, dstore_vals.dtype)
        # Sanity-check: the datastore should hold one entry per written token.
        target_tokens_file.seek(0)
        num_lines = len(target_tokens_file.readlines())
        if dstore_idx != num_lines:
            print(
                'Warning: size of KNN datastore is {}, does not match number of lines in train tokens file which is {}'
                .format(dstore_idx, num_lines))

    if args.save_knnlm_dstore or args.knnlm:
        # source_tokens_file.close()
        target_tokens_file.close()

    return scorer
def main(args):
    """Legacy training entry point: load a (binary or raw-text) dataset,
    build model/criterion/trainer, restore the latest checkpoint, and run
    the epoch loop with per-epoch validation and checkpointing.
    """
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            # Checkpoint was written at an epoch boundary: step the LR
            # scheduler and resume from the next epoch.
            trainer.lr_step(epoch)
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, trainer, dataset, epoch, batch_offset)

        # evaluate on validate set
        if epoch % args.validate_interval == 0:
            for k, subset in enumerate(args.valid_subset.split(',')):
                val_loss = validate(args, trainer, dataset, subset, epoch)
                if k == 0:
                    # only use first validation loss to update the learning schedule
                    lr = trainer.lr_step(epoch, val_loss)

                    # save checkpoint
                    if not args.no_save:
                        save_checkpoint(trainer, args, epoch, 0, val_loss)
        else:
            # No validation this epoch: step the LR schedule without a loss.
            lr = trainer.lr_step(epoch)

        epoch += 1
        batch_offset = 0

        if trainer.get_num_updates() >= max_update:
            break
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Train a (wikicatsum-style) model: set up task/model/criterion, pick an
    FP16 or FP32 trainer, build the epoch iterator (optionally filtering
    example indices listed in --outindices), restore the latest checkpoint,
    and run the epoch loop until LR/epoch/update limits are hit.
    """
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    torch.cuda.empty_cache()

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(args, task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        trainer = Trainer(
            args, task, model, criterion
        )  ##trainer seems to be generic enough to be used for wikicatsum

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Collect example indices to exclude from training.
    # BUGFIX: the indices file was opened without ever being closed; use a
    # context manager so the handle is released deterministically.
    ignoredIndices = []
    if args.outindices:
        print("* Filter examples with indices in: " + args.outindices)
        with open(args.outindices, 'r') as f:
            for line in f:
                ignoredIndices.append(int(line.strip()))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()
    # NOTE(review): max_sentences_valid (not max_sentences) is used for the
    # TRAIN iterator here — confirm this is intentional.
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        savedir=os.path.join(args.save_dir, ""),
        ignoredIndices=ignoredIndices,
    )
    print("* Epoch batch iterator created nb. {}".format(len(epoch_itr)))

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr)

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    trainer.dummy_train_step(dummy_batch)
    print("Ok dummy step")

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Generation entry point for the copy-augmented GEC models.

    Decodes the chosen subset with an ensemble, and — when the model emits
    per-position copy scores — replaces predicted <unk> tokens with the raw
    source token that received the highest copy score. Prints S/K/T/H/P/A/U
    tagged lines per sample and accumulates a BLEU score.

    Args:
        args: parsed generation arguments (requires --path and --raw-text).

    Returns:
        The BLEU scorer object accumulated over the top hypotheses.
    """
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    # MODIFIED: The GEC task uses token-labeled raw text datasets, which
    # require raw text to be used.
    assert args.raw_text, \
        f"--raw-text option is required for copy-based generation."

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    # SECURITY: eval() on --model-overrides executes arbitrary code; only
    # pass trusted values on the command line.
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    has_copy_scores = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()

            # MODIFIED: Use copy scores to replace <unk>'s with raw source
            # words. use_copy_scores may be False with non-copy-based
            # transformers that only use edit labels (e.g., transformer_aux_el
            # and transformer_el).
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            use_copy_scores = hypos[0][0].get('copy_scores', None) is not None
            if has_copy_scores and not use_copy_scores:
                print("| generate_or_copy.py | INFO | "
                      "Model does not include copy scores. "
                      "Generating hypotheses without replacing UNKs.")
                has_copy_scores = False

            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # MODIFIED: Replace <unk>s with raw source tokens. This is
                # analogous to the case where align_dict is provided in the
                # original generate.py.
                rawtext_dataset = task.dataset(args.gen_subset)
                src_str = rawtext_dataset.src.get_original_text(sample_id)
                tokenized_src_str = rawtext_dataset.src_dict.string(
                    src_tokens, bpe_symbol=args.remove_bpe)
                target_str = rawtext_dataset.tgt.get_original_text(sample_id)

                if not args.quiet:
                    if src_dict is not None:
                        # Raw source text
                        print('S-{}\t{}'.format(sample_id, src_str))
                        # Tokenized source text
                        print('K-{}\t{}'.format(sample_id, tokenized_src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions.
                # FIX: previously sliced with min(len(hypos), args.nbest),
                # where len(hypos) is the *batch* size, wrongly truncating the
                # n-best list whenever the batch was smaller than --nbest.
                # Python slices cap at the list length, so [:args.nbest] is
                # both correct and sufficient.
                for k, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    # MODIFIED: Replace predicted <unk>s with the source token
                    # that received the highest score.
                    raw_src_tokens = src_str.split()
                    final_hypo_tokens_str = []
                    for tgt_position, hypo_token in enumerate(hypo_tokens):
                        if use_copy_scores and hypo_token == tgt_dict.unk():
                            # See sequence_copygenerator.py#L292 for details.
                            copy_scores = hypo[
                                'copy_scores'][:, tgt_position].cpu()
                            assert len(copy_scores) - 1 == len(raw_src_tokens), \
                                f"length of copy scores do not match input source tokens " \
                                f"(copy_scores: {copy_scores}, raw_src_tokens: {raw_src_tokens})"
                            src_position = torch.argmax(copy_scores).item()
                            # Don't copy if attending to an EOS (not ideal).
                            if src_position == len(raw_src_tokens):
                                print("WARNING: copy score highest at EOS.")
                            else:
                                final_hypo_tokens_str.append(
                                    raw_src_tokens[src_position])
                            print('U-{}\t{}\t{}'.format(
                                sample_id,
                                tgt_position,
                                ' '.join(
                                    map(
                                        lambda x: '{:.4f}'.format(x),
                                        copy_scores.tolist(),
                                    )),
                            ))
                        else:
                            final_hypo_tokens_str.append(tgt_dict[hypo_token])

                    # Note: raw input tokens could be included here.
                    final_hypo_str = ' '.join([
                        token for token in final_hypo_tokens_str
                        if token != tgt_dict.eos_word
                    ])

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    final_hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and k == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    return scorer
def main(args):
    """Generation entry point that also computes and persists corpus metrics.

    Decodes the chosen subset with an ensemble, prints S/T/H/P/A tagged lines,
    dumps per-example predictions in "notebook" format next to the checkpoint,
    and saves metrics (computed against the checkpoint's training epoch) to
    metrics.json.

    Returns:
        Tuple of (corpus_bleu, em) metric values.
    """
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    # print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble.
    # NOTE(review): eval() on --model-overrides executes arbitrary code; only
    # pass trusted values on the command line.
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )
    # Epoch the checkpoint was saved at; used to tag the computed metrics.
    trained_epoch = checkpoint_utils.get_checkpoint_epoch(args.path.split(':'))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    # itr = task.get_batch_iterator(
    #     dataset=task.dataset(args.gen_subset),
    #     max_tokens=args.max_tokens,
    #     max_sentences=args.max_sentences,
    #     max_positions=utils.resolve_max_positions(
    #         task.max_positions(),
    #         *[model.max_positions() for model in models]
    #     ),
    #     ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
    #     required_batch_size_multiple=args.required_batch_size_multiple,
    #     num_shards=args.num_shards,
    #     shard_id=args.shard_id,
    #     num_workers=args.num_workers,
    # ).next_epoch_itr(shuffle=False)
    # we modify to use the max_positions only from the task and not the model.
    # the reason is that we keep a low max positions while training transformer
    # to handle large batches, but we need to disable this while testing to get
    # metrics evaluated on full dev/test set.
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=task.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    # em_scorer = bleu.EmScorer()
    all_metrics = bleu.Metric()
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    all_preds = []  # per-example dicts dumped to the preds/ notebook at the end
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                all_preds.append({
                    'id': sample_id,
                    'tgt_str': target_str,
                    'src_str': src_str,
                    'url': task.urls[sample_id]
                })

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions.
                # NOTE(review): the inner loop variable shadows the outer `i`
                # (sample index); harmless here because `hypos[i]` is evaluated
                # before the loop and the outer `i` is reassigned by enumerate,
                # but a distinct name (e.g. `k`) would be safer.
                # NOTE(review): min(len(hypos), args.nbest) uses the *batch*
                # size, not len(hypos[i]) — truncates the n-best list whenever
                # the batch is smaller than --nbest; confirm intended.
                for i, hypo in enumerate(
                        hypos[i][:min(len(hypos), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    # print('=========')
                    # print(hypo_tokens)
                    # print(hypo_str)
                    # print(align_dict)
                    if i == 0:
                        # get best hypothesis
                        all_preds[-1]['hypo_str'] = hypo_str
                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and i == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)
                        all_metrics.add_string(target_str, hypo_str)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    # we can dump the preds out in notebook format
    preds_dir = dirname(args.path) + '/preds'
    # sort them in order of index in dev/test set.
    all_preds.sort(key=lambda x: int(x['id']))
    log_preds_to_notebook(preds=all_preds, outdir=preds_dir)

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    print(args.path)

    # compute, store, and print the metrics
    all_metrics.compute_metrics(trained_epoch)
    all_metrics.save(dirname(args.path) + '/metrics.json')
    print('All metrics:')
    print(all_metrics.result_string())

    return all_metrics.get_metric('corpus_bleu'), all_metrics.get_metric('em')
def train(args, extra_state, trainer, dataset):
    """Generator-style training loop with optional magnitude pruning.

    Trains until the LR drops below args.min_lr or args.max_epoch is reached.
    When args.pruning_percentile > 0, the first stop condition triggers a
    prune-then-retrain phase instead of terminating: pruned weights are
    re-zeroed before every step while extra_state["retraining"] is set.

    Yields:
        (num_updates, stats_dict) after every training step and at each epoch
        end, where stats_dict carries train/tune perplexity, tune BLEU and
        translation samples.
    """
    # offset for current epoch (may be different from checkpoint offset)
    starting_offset = extra_state["batch_offset"]

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()

    do_prune = (args.pruning_percentile > 0)
    extra_state["retraining"] = False
    prune_masks = None  # set by prune() once the retraining phase begins
    while lr > args.min_lr and extra_state["epoch"] <= max_epoch:
        """Train the model for one epoch."""
        itr, progress, extra_meters = setup_epoch(
            args=args,
            epoch=extra_state["epoch"],
            batch_offset=starting_offset,
            trainer=trainer,
            dataset=dataset,
        )
        for i, sample in enumerate(itr, start=starting_offset):
            # During retraining, keep pruned weights pinned at zero so the
            # optimizer cannot revive them.
            if extra_state["retraining"]:
                for name, params in trainer.model.named_parameters():
                    if "weight" in name:
                        params.data[prune_masks[name]] = 0.0
            log_output = trainer.train_step(sample)
            train_stats = log_mid_epoch_stats(
                trainer=trainer,
                progress=progress,
                extra_meters=extra_meters,
                log_output=log_output,
            )

            # Running average of all parameters, accumulated once per step
            # after the configured number of epochs.
            if (args.continuous_averaging_after_epochs >= 0
                    and extra_state["epoch"] >
                    args.continuous_averaging_after_epochs):
                model_param_dict = trainer.model.state_dict()
                if "param_totals" not in extra_state:
                    extra_state["param_totals"] = {}
                    for name, value in model_param_dict.items():
                        extra_state["param_totals"][name] = value.clone()
                    extra_state["param_accum_count"] = 1
                else:
                    for name, value in model_param_dict.items():
                        extra_state["param_totals"][name] += value
                    extra_state["param_accum_count"] += 1

            if i == starting_offset:
                # ignore the first mini-batch in words-per-second calculation
                trainer.get_meter("wps").reset()

            num_updates = trainer.get_num_updates()
            do_validate = (args.subepoch_validate_interval > 0
                           and num_updates % args.subepoch_validate_interval
                           == 0)
            do_save = (not args.no_save and args.save_interval > 0
                       and num_updates % args.save_interval == 0)
            do_eval_bleu = (
                # We can only do BLEU eval when we have a new checkpoint to load.
                do_save and args.generate_bleu_eval_interval > 0
                and num_updates - extra_state["last_bleu_eval"] >=
                args.generate_bleu_eval_interval)
            if do_eval_bleu:
                extra_state["last_bleu_eval"] = num_updates

            extra_state["batch_offset"] = i + 1
            (
                _,
                val_ppl,
                val_bleu,
                stop_training_mid_epoch,
                translation_samples,
            ) = validate_save_and_evaluate_bleu(
                args=args,
                trainer=trainer,
                dataset=dataset,
                extra_state=extra_state,
                do_validate=do_validate,
                do_save=do_save,
                do_eval_bleu=do_eval_bleu,
            )
            yield (
                trainer.get_num_updates(),
                {
                    "train_ppl": train_stats["ppl"],
                    "tune_ppl": val_ppl,
                    "tune_bleu": val_bleu,
                    "translation_samples": translation_samples,
                },
            )

            if stop_training_mid_epoch:
                break

        # log end-of-epoch stats
        train_stats = log_end_epoch_stats(trainer=trainer,
                                          progress=progress,
                                          extra_meters=extra_meters)

        # Run a training step if not stopping mid-epoch.
        if not stop_training_mid_epoch:
            # batch_offset being None denotes the end of an epoch.
            extra_state["batch_offset"] = None
            (
                val_loss,
                val_ppl,
                val_bleu,
                stop_training_end_of_epoch,
                translation_samples,
            ) = validate_save_and_evaluate_bleu(
                args=args,
                trainer=trainer,
                dataset=dataset,
                extra_state=extra_state,
                do_validate=True,
                do_save=not args.no_save
                and not args.no_end_of_epoch_checkpoints,
                do_eval_bleu=args.generate_bleu_eval_per_epoch,
            )
            extra_state["val_loss"] = val_loss
            yield (
                trainer.get_num_updates(),
                {
                    "train_ppl": train_stats["ppl"],
                    "tune_ppl": val_ppl,
                    "tune_bleu": val_bleu,
                    "translation_samples": translation_samples,
                },
            )

        # NOTE(review): stop_training_end_of_epoch is only bound in the branch
        # above; the short-circuiting `or` avoids a NameError when stopping
        # mid-epoch, but the pattern is fragile.
        if stop_training_mid_epoch or stop_training_end_of_epoch:
            if do_prune and not extra_state["retraining"]:
                # Switch from the stop condition into prune-then-retrain:
                # shrink LR, reset best-loss/BLEU trackers, zero out pruned
                # weights via masks from prune().
                lr *= args.retrain_lr_ratio
                extra_state["validate"]["lowest_loss"] = np.inf
                extra_state["evaluate_bleu"]["best"] = 0
                stop_training_mid_epoch = False
                stop_training_end_of_epoch = False
                prune_masks = prune(args, trainer)
                extra_state["retraining"] = True
                print("| Finished pruning and switching to retraining")
            else:
                break

        # NOTE(review): val_loss is only bound when the end-of-epoch branch
        # ran; if we reach here after a mid-epoch stop that was converted to
        # retraining, this raises NameError — confirm against callers.
        lr = trainer.lr_step(extra_state["epoch"], val_loss)
        extra_state["epoch"] += 1
        extra_state["batch_offset"] = 0
        starting_offset = 0

        if is_training_over_time_limit(extra_state["start_time"],
                                       args.stop_time_hr):
            break

    train_meter.stop()
    print(f"| done training in {train_meter.sum:.1f} seconds")
    if "evaluate_bleu" in extra_state:
        print(
            f"| Best BLEU score of {extra_state['evaluate_bleu']['best']} was from "
            f"epoch {extra_state['evaluate_bleu']['best_epoch']}")
def main(args, init_distributed=False):
    """Training entry point for Super/SubTransformer (HAT-style) models.

    Handles distributed init, optional FLOPs profiling and latency
    measurement (both of which exit the process), then runs the epoch-level
    training loop with per-config validation.

    Args:
        args: parsed argument namespace.
        init_distributed: when True, performs distributed initialization and
            sets args.distributed_rank.
    """
    utils.import_user_module(args)
    utils.handle_save_path(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    #if torch.cuda.is_available() and not args.cpu:
    #    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(f"| Configs: {args}")

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(
        f"| Model: {args.arch} \n| Criterion: {criterion.__class__.__name__}")

    # Log architecture
    if args.train_subtransformer:
        print(" \n\n\t\tWARNING!!! Training one single SubTransformer\n\n")
        print(
            f"| SubTransformer Arch: {utils.get_subtransformer_config(args)} \n"
        )
    else:
        print(" \n\n\t\tWARNING!!! Training SuperTransformer\n\n")
        print(f"| SuperTransformer Arch: {model} \n")

    # Log model size
    if args.train_subtransformer:
        print(
            f"| SubTransformer size (without embedding weights): {model.get_sampled_params_numel(utils.get_subtransformer_config(args))}"
        )
        embed_size = args.decoder_embed_dim_subtransformer * len(task.tgt_dict)
        print(f"| Embedding layer size: {embed_size} \n")
    else:
        model_s = 0
        # if use model.state_dict, then will add 2 more parameters, they are
        # encoder.version and decoder.version. Should not count them
        for name, param in model.named_parameters():
            if 'embed' not in name:
                model_s += param.numel()
        print(
            f"| SuperTransofmer model size (without embedding weights): {model_s}"
        )
        print(
            f"| Embedding layer size: {sum(p.numel() for p in model.parameters() if p.requires_grad) - model_s} \n"
        )

    # specify the length of the dummy input for profile
    # for iwslt, the average length is 23, for wmt, that is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    # Dummy batch: BOS/EOS token id 2 plus filler token id 7, mirrored for
    # the decoder's previous-output tokens.
    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # profile the overall FLOPs number
    if args.profile_flops:
        import torchprofile
        config_subtransformer = utils.get_subtransformer_config(args)
        model.set_sample_config(config_subtransformer)
        model.profile(mode=True)
        macs = torchprofile.profile_macs(
            model,
            args=(torch.tensor([dummy_src_tokens], dtype=torch.long),
                  torch.tensor([30]),
                  torch.tensor([dummy_prev], dtype=torch.long)))
        model.profile(mode=False)
        last_layer_macs = config_subtransformer['decoder'][
            'decoder_embed_dim'] * dummy_sentence_length * len(task.tgt_dict)

        print(f"| Total FLOPs: {macs * 2}")
        print(f"| Last layer FLOPs: {last_layer_macs * 2}")
        print(
            f"| Total FLOPs without last layer: {(macs - last_layer_macs) * 2} \n"
        )
        exit(0)

    # NOTE(review): anomaly detection stays enabled for everything below —
    # confirm this is intended, since it slows training considerably.
    with torch.autograd.set_detect_anomaly(True):
        # Build trainer
        trainer = Trainer(args, task, model, criterion)
        print(f"| Training on {args.distributed_world_size} GPUs")
        # print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {args.max_sentences} \n")
        print(
            f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {None} \n"
        )

        # Measure model latency, the program will exit after profiling latency
        if args.latcpu or args.latgpu:
            utils.measure_latency(args, model, dummy_src_tokens, dummy_prev)
            exit(0)

        # Load the latest checkpoint if one is available and restore the
        # corresponding train iterator
        extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

        # Evaluate the SubTransformer
        if args.validate_subtransformer:
            config = utils.get_subtransformer_config(args)
            trainer.set_sample_config(config)
            valid_loss = validate(args, trainer, task, epoch_itr, ['valid'],
                                  'SubTransformer')
            print(f"| SubTransformer validation loss:{valid_loss}")

        # Loop boundaries
        max_epoch = args.max_epoch or math.inf
        max_update = args.max_update or math.inf
        lr = trainer.get_lr()
        train_meter = StopwatchMeter()
        train_meter.start()
        valid_subsets = args.valid_subset.split(',')

        # Representative sampled architectures to validate each epoch.
        represent_configs = utils.get_represent_configs(args)

        # Main training loop
        while lr > args.stop_min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
        ) < max_update:
            # train for one epoch
            train(args, trainer, task, epoch_itr)

            if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
                for k, v in represent_configs.items():
                    trainer.set_sample_config(config=v)
                    valid_losses = validate(args,
                                            trainer,
                                            task,
                                            epoch_itr,
                                            valid_subsets,
                                            sampled_arch_name=k)
            else:
                valid_losses = [None]

            # update the best loss and get current lr; the real lr scheduling
            # is done in trainer.train_step()
            lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

            # save checkpoint epoch level
            if epoch_itr.epoch % args.save_interval == 0:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                                 valid_losses[0])

        train_meter.stop()
        print('| Done training in {:.1f} seconds'.format(train_meter.sum))
def _main(args, output_file):
    """Render frames with a neural-rendering model ensemble and merge the
    per-shard outputs into videos.

    Each distributed shard renders a contiguous slice of the pose sequence;
    shard 0 merges all shards' outputs at the end.

    Args:
        args: parsed generation/render arguments.
        output_file: stream that receives the logging output.
    """
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairnr_cli.render')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load ensemble.
    # SECURITY: eval() on --model-overrides executes arbitrary code; only
    # pass trusted values on the command line.
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides.replace("\\", "")),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    logging.info(model)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_workers=args.num_workers).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Partition the frames to render across distributed shards: the last
    # shard absorbs the remainder.
    shard_id, world_size = args.distributed_rank, args.distributed_world_size
    output_files = []
    if generator.test_poses is not None:
        total_frames = generator.test_poses.shape[0]
        _frames = int(np.floor(total_frames / world_size))
        step = shard_id * _frames
        frames = _frames if shard_id < (world_size -
                                        1) else total_frames - step
    else:
        step = shard_id * args.render_num_frames
        frames = args.render_num_frames

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for i, sample in enumerate(t):
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            gen_timer.start()
            step, _output_files = task.inference_step(generator, models,
                                                      [sample, step, frames])
            output_files += _output_files
            gen_timer.stop(500)
            wps_meter.update(500)
            t.log({'wps': round(wps_meter.avg)})

    timestamp = generator.save_images(
        output_files,
        steps='shard{}'.format(shard_id),
        combine_output=args.render_combine_output)

    # join videos from all GPUs and delete temp files.
    # FIX: narrowed the previous bare `except:` — it also swallowed
    # KeyboardInterrupt/SystemExit. The single-process fallback (no
    # distributed group) is preserved as a best-effort path.
    try:
        timestamps = distributed_utils.all_gather_list(timestamp)
    except Exception:
        timestamps = [timestamp]

    if shard_id == 0:
        generator.merge_videos(timestamps)
def train(
    args,
    extra_state: Dict[str, Any],
    trainer,
    task,
    epoch_itr,
    output_queue: Optional[mp_queues.Queue] = None,
    **train_step_kwargs,
):
    """Epoch-level training loop that reports progress through a queue.

    Trains until the LR drops below args.min_lr, args.max_epoch is reached,
    a FloatingPointError is raised by the trainer, save_and_eval requests a
    stop, or the wall-clock time limit is exceeded. On the master process,
    pushes (num_updates, stats) tuples onto output_queue after each step and
    each epoch, and finally pushes None to signal completion.

    Args:
        args: parsed argument namespace.
        extra_state: mutable dict carrying batch_offset/epoch/eval state
            across calls (and checkpoints).
        trainer: training driver exposing train_step/lr_step/meters.
        task: fairseq task instance.
        epoch_itr: epoch batch iterator.
        output_queue: optional multiprocessing queue for progress reporting.
        **train_step_kwargs: forwarded verbatim to trainer.train_step.
    """
    # offset for current epoch (may be different from checkpoint offset)
    starting_offset = extra_state["batch_offset"]

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    stop_training_mid_epoch = False
    stop_training_end_of_epoch = False

    do_prune = args.pruning_percentile > 0
    if do_prune:
        # Masks are computed once up front and re-applied after every step so
        # the optimizer cannot revive pruned weights.
        prune_masks = create_prune_masks(args, trainer)
        apply_prune_masks(prune_masks, trainer)

    while lr > args.min_lr and extra_state["epoch"] <= max_epoch:
        """Train the model for one epoch."""
        itr, progress, extra_meters = setup_epoch(args=args,
                                                  epoch_itr=epoch_itr,
                                                  trainer=trainer)

        for i, samples in enumerate(progress, start=starting_offset):
            clear_per_step_extra_state(extra_state)
            try:
                log_output = trainer.train_step(samples, **train_step_kwargs)
            # Fairseq's fp16_trainer raises this uncommon error to indicate
            # that we should stop training.
            except FloatingPointError as e:
                print(f"Stopping training due to: {e}.")
                stop_training_mid_epoch = True
                break

            if do_prune:
                apply_prune_masks(prune_masks, trainer)

            if i == starting_offset:
                # ignore the first mini-batch in words-per-second calculation
                trainer.get_meter("wps").reset()

            num_updates = trainer.get_num_updates()
            do_eval_tune_loss = (args.subepoch_validate_interval > 0
                                 and num_updates %
                                 args.subepoch_validate_interval == 0)
            do_save = (not args.no_save and args.save_interval_updates > 0
                       and num_updates % args.save_interval_updates == 0)
            do_eval_bleu = (
                # We can only do BLEU eval when we have a new checkpoint to load.
                do_save and args.generate_bleu_eval_interval > 0
                and num_updates - extra_state["tune_bleu"]["last_eval_step"] >=
                args.generate_bleu_eval_interval)
            if do_eval_bleu:
                extra_state["tune_bleu"]["last_eval_step"] = num_updates

            extra_state["batch_offset"] = i + 1
            (extra_state, stop_training_mid_epoch,
             translation_samples) = save_and_eval(
                 args=args,
                 trainer=trainer,
                 task=task,
                 extra_state=extra_state,
                 do_eval_tune_loss=do_eval_tune_loss,
                 do_save=do_save,
                 do_eval_bleu=do_eval_bleu,
             )

            # This should come after save_and_eval. Even if log_output is None,
            # meaning that there was an overflow, We should still run
            # save_and_eval to sync all_reduce and then skip the batch.
            if log_output is None:
                # This indicates that the batch was skipped, typically
                # because of OOM or FP16 overflow.
                continue

            train_stats = log_mid_epoch_stats(
                trainer=trainer,
                progress=progress,
                extra_meters=extra_meters,
                log_output=log_output,
            )

            if distributed_utils.is_master(args) and output_queue is not None:
                output_queue.put_nowait((
                    trainer.get_num_updates(),
                    {
                        "train_ppl": train_stats["ppl"],
                        "tune_ppl": extra_state["tune_eval"]["perplexity"],
                        "tune_bleu": extra_state["tune_bleu"]["current"],
                        "translation_samples": translation_samples,
                    },
                ))

            # Decay the LR when BLEU has not improved for too many evals.
            if (do_eval_bleu and args.shrink_lr_no_best_bleu_eval > 0
                    and extra_state["tune_bleu"]["num_since_best"] >
                    args.shrink_lr_no_best_bleu_eval):
                current_lr = trainer.optimizer.get_lr()
                trainer.optimizer.set_lr(current_lr * args.lr_shrink)
                lr = trainer.optimizer.get_lr()
                print(f"Decayed lr from {current_lr} to {lr}.")

            stop_training_mid_epoch = (stop_training_mid_epoch
                                       or is_training_over_time_limit(
                                           extra_state["start_time"],
                                           args.stop_time_hr))
            if stop_training_mid_epoch:
                break

        # log end-of-epoch stats
        train_stats = log_end_epoch_stats(trainer=trainer,
                                          progress=progress,
                                          extra_meters=extra_meters)

        # Run a training step if not stopping mid-epoch.
        if not stop_training_mid_epoch:
            # batch_offset being None denotes the end of an epoch.
            extra_state["batch_offset"] = None
            (
                extra_state,
                stop_training_end_of_epoch,
                translation_samples,
            ) = save_and_eval(
                args=args,
                trainer=trainer,
                task=task,
                extra_state=extra_state,
                do_eval_tune_loss=True,
                do_save=not args.no_save
                and not args.no_end_of_epoch_checkpoints,
                do_eval_bleu=args.generate_bleu_eval_per_epoch,
            )
            if distributed_utils.is_master(args) and output_queue is not None:
                output_queue.put_nowait((
                    trainer.get_num_updates(),
                    {
                        "train_ppl": train_stats["ppl"],
                        "tune_ppl": extra_state["tune_eval"]["perplexity"],
                        "tune_bleu": extra_state["tune_bleu"]["current"],
                        "translation_samples": translation_samples,
                    },
                ))

        if stop_training_mid_epoch or stop_training_end_of_epoch:
            break

        lr = trainer.lr_step(extra_state["epoch"],
                             extra_state["tune_eval"]["loss"])
        extra_state["epoch"] += 1
        extra_state["batch_offset"] = 0
        starting_offset = 0

    train_meter.stop()
    print(f"| done training in {train_meter.sum:.1f} seconds")
    print(f"| Best BLEU score of {extra_state['tune_bleu']['best']} was from "
          f"epoch {extra_state['tune_bleu']['best_epoch']}")

    # Put None in the queue to indicate to the consumer that training
    # has finished.
    if distributed_utils.is_master(args) and output_queue is not None:
        output_queue.put_nowait(None)
def train(args, extra_state, trainer, task, epoch_itr):
    """Run the training loop, yielding progress tuples after each eval point.

    This is a generator: after every validation/checkpoint opportunity it
    yields ``(num_updates, stats_dict)`` where the dict carries train
    perplexity, tune perplexity/BLEU and translation samples.

    Args:
        args: parsed command-line namespace (update_freq, save/eval
            intervals, pruning_percentile, stop_time_hr, ...).
        extra_state: mutable dict of resumable training state; this function
            reads/writes "batch_offset", "epoch", "last_bleu_eval",
            "val_loss", "start_time" and reads "evaluate_bleu" at the end.
        trainer: object driving train_step/lr_step/meters.
        task: task object passed through to validation/eval helpers.
        epoch_itr: epoch batch iterator.
    """
    # offset for current epoch (may be different from checkpoint offset)
    starting_offset = extra_state["batch_offset"]

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    stop_training_mid_epoch = False
    stop_training_end_of_epoch = False

    # Optional magnitude pruning: masks are computed once and re-applied
    # after every parameter update so pruned weights stay zero.
    do_prune = args.pruning_percentile > 0
    if do_prune:
        prune_masks = create_prune_masks(args, trainer)
        apply_prune_masks(prune_masks, trainer)

    # update parameters every N batches; update_freq may be scheduled
    # per-epoch (list), falling back to the last entry for later epochs.
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    num_batches = len(epoch_itr)
    while lr > args.min_lr and extra_state["epoch"] <= max_epoch:
        """Train the model for one epoch."""
        itr, progress, extra_meters = setup_epoch(
            args=args, epoch_itr=epoch_itr, trainer=trainer
        )

        for i, sample in enumerate(progress, start=starting_offset):
            if i < num_batches - 1 and (i + 1) % update_freq > 0:
                # buffer updates according to --update-freq
                trainer.train_step(sample, update_params=False)
                continue
            else:
                log_output = trainer.train_step(sample, update_params=True)

            if do_prune:
                # re-zero pruned weights after the optimizer step
                apply_prune_masks(prune_masks, trainer)

            train_stats = log_mid_epoch_stats(
                trainer=trainer,
                progress=progress,
                extra_meters=extra_meters,
                log_output=log_output,
            )

            if i == starting_offset:
                # ignore the first mini-batch in words-per-second calculation
                trainer.get_meter("wps").reset()

            num_updates = trainer.get_num_updates()
            do_validate = (
                args.subepoch_validate_interval > 0
                and num_updates % args.subepoch_validate_interval == 0
            )
            do_save = (
                not args.no_save
                and args.save_interval_updates > 0
                and num_updates % args.save_interval_updates == 0
            )
            do_eval_bleu = (
                # We can only do BLEU eval when we have a new checkpoint to load.
                do_save
                and args.generate_bleu_eval_interval > 0
                and num_updates - extra_state["last_bleu_eval"]
                >= args.generate_bleu_eval_interval
            )
            if do_eval_bleu:
                extra_state["last_bleu_eval"] = num_updates

            # batch_offset records the batch to resume from on restart
            extra_state["batch_offset"] = i + 1

            (
                _,
                val_ppl,
                val_bleu,
                stop_training_mid_epoch,
                translation_samples,
                lr,
            ) = validate_save_and_evaluate_bleu(
                args=args,
                trainer=trainer,
                task=task,
                extra_state=extra_state,
                do_validate=do_validate,
                do_save=do_save,
                do_eval_bleu=do_eval_bleu,
            )
            yield (
                trainer.get_num_updates(),
                {
                    "train_ppl": train_stats["ppl"],
                    "tune_ppl": val_ppl,
                    "tune_bleu": val_bleu,
                    "translation_samples": translation_samples,
                },
            )

            stop_training_mid_epoch = (
                stop_training_mid_epoch
                or is_training_over_time_limit(
                    extra_state["start_time"], args.stop_time_hr
                )
            )
            if stop_training_mid_epoch:
                break

        # log end-of-epoch stats
        train_stats = log_end_epoch_stats(
            trainer=trainer, progress=progress, extra_meters=extra_meters
        )

        # Run a training step if not stopping mid-epoch.
        if not stop_training_mid_epoch:
            # batch_offset being None denotes the end of an epoch.
            extra_state["batch_offset"] = None
            (
                val_loss,
                val_ppl,
                val_bleu,
                stop_training_end_of_epoch,
                translation_samples,
                lr,
            ) = validate_save_and_evaluate_bleu(
                args=args,
                trainer=trainer,
                task=task,
                extra_state=extra_state,
                do_validate=True,
                do_save=not args.no_save and not args.no_end_of_epoch_checkpoints,
                do_eval_bleu=args.generate_bleu_eval_per_epoch,
            )
            extra_state["val_loss"] = val_loss
            yield (
                trainer.get_num_updates(),
                {
                    "train_ppl": train_stats["ppl"],
                    "tune_ppl": val_ppl,
                    "tune_bleu": val_bleu,
                    "translation_samples": translation_samples,
                },
            )
        if stop_training_mid_epoch or stop_training_end_of_epoch:
            break

        # adjust LR on the end-of-epoch tune loss, then reset offsets for
        # the next epoch
        lr = trainer.lr_step(extra_state["epoch"], val_loss)
        extra_state["epoch"] += 1
        extra_state["batch_offset"] = 0
        starting_offset = 0

    train_meter.stop()
    print(f"| done training in {train_meter.sum:.1f} seconds")
    print(
        f"| Best BLEU score of {extra_state['evaluate_bleu']['best']} was from "
        f"epoch {extra_state['evaluate_bleu']['best_epoch']}"
    )
def _main(args, output_file):
    """Generate translations for ``args.gen_subset`` with a model ensemble.

    Writes S-/T-/H-/P-/A-/I-/E- tagged lines to ``output_file``, scores the
    top hypothesis against the reference, and returns the scorer object.

    Args:
        args: parsed generation namespace (path, beam, nbest, remove_bpe, ...).
        output_file: writable text stream for logging and generation output.

    Returns:
        The BLEU scorer (``bleu.SacrebleuScorer`` or ``bleu.Scorer``)
        accumulated over all top-1 hypotheses.
    """
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    # default batch budget when neither limit is given
    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries; source dictionary is optional (e.g. LM tasks)
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    # NOTE(review): eval() of --model-overrides executes arbitrary text from
    # the command line — acceptable for a CLI tool, never for untrusted input.
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            # optionally force-decode the first prefix_size target tokens
            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str),
                              file=output_file)
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str),
                              file=output_file)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        score = hypo['score'] / math.log(
                            2)  # convert to base 2
                        print('H-{}\t{}\t{}'.format(sample_id, score,
                                                    hypo_str),
                              file=output_file)
                        print(
                            'P-{}\t{}'.format(
                                sample_id,
                                ' '.join(
                                    map(
                                        lambda x: '{:.4f}'.format(x),
                                        # convert from base e to base 2
                                        hypo['positional_scores'].div_(
                                            math.log(2)).tolist(),
                                    ))),
                            file=output_file)

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join([
                                    '{}-{}'.format(src_idx, tgt_idx)
                                    for src_idx, tgt_idx in alignment
                                ])),
                                  file=output_file)

                        if args.print_step:
                            print('I-{}\t{}'.format(sample_id, hypo['steps']),
                                  file=output_file)

                        if getattr(args, 'retain_iter_history', False):
                            # dump intermediate hypotheses per decoding step
                            for step, h in enumerate(hypo['history']):
                                _, h_str, _ = utils.post_process_prediction(
                                    hypo_tokens=h['tokens'].int().cpu(),
                                    src_str=src_str,
                                    alignment=None,
                                    align_dict=None,
                                    tgt_dict=tgt_dict,
                                    remove_bpe=None,
                                )
                                print('E-{}_{}\t{}'.format(
                                    sample_id, step, h_str),
                                      file=output_file)

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        logger.info('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    return scorer
def main(args, init_distributed=False):
    """Entry point for (optionally distributed) training.

    Sets up CUDA/distributed state, builds task/model/criterion/trainer,
    restores the latest checkpoint, then trains epoch-by-epoch until the
    learning rate drops below ``args.min_lr`` or the update budget is hit.

    Args:
        args: parsed training namespace (max_tokens/max_sentences, seed,
            save_dir, validate/save intervals, train_feat_files, ...).
        init_distributed: when True, initialize the distributed backend and
            record the resulting rank on ``args``.
    """
    utils.import_user_module(args)

    # Best-effort registration of the internal path manager; absence of the
    # module is expected outside that environment.
    try:
        from fairseq.fb_pathmgr import fb_pathmgr
        global fb_pathmgr_registerd
        if not fb_pathmgr_registerd:
            fb_pathmgr.register()
            fb_pathmgr_registerd = True
    except (ModuleNotFoundError, ImportError):
        pass

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max input frames per GPU = {} and max sentences per GPU = {}'.
          format(
              args.max_tokens,
              args.max_sentences,
          ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)
    if callable(getattr(trainer.criterion, 'set_train_tgt_dataset', None)):
        trainer.criterion.set_train_tgt_dataset(
            task.dataset(args.train_subset).tgt)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    # BUGFIX: initialize valid_losses so that lr_step/save_checkpoint below
    # do not raise UnboundLocalError when validation is disabled (or when
    # validate_interval does not divide the epoch number). Matches the other
    # training entry points in this file.
    valid_losses = [None]
    while (lr > args.min_lr and (epoch_itr.epoch < max_epoch or
                                 (epoch_itr.epoch == max_epoch and
                                  epoch_itr._next_epoch_itr is not None)) and
           trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # only use first validation wer to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        # sharded data: get train iterator for next epoch, reloading the
        # dataset only when more than one feature-file shard exists
        reload_dataset = len(args.train_feat_files) > 1
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Entry point for single-node GPU training of a (fine-tuned) model.

    Builds task/model/criterion, constructs an FP16 or FP32 trainer,
    freezes parent-model parameters, restores a checkpoint, then trains
    until lr/epoch/update limits are reached.

    Args:
        args: parsed training namespace (device_id, seed, fp16, intervals,
            max_tokens defaulting to 6000, ...).

    Raises:
        NotImplementedError: if no CUDA device is available.
    """
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(args, task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        # compute capability >= 7.0 (Volta+) has tensor cores
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    # load_checkpoint(args, trainer, epoch_itr)

    # unset grads for old params
    trainer.unset_parent_model_param()
    trainer._build_optimizer()
    # load checkpoint later, so that trainer will _build_optimizer first
    load_checkpoint(args, trainer, epoch_itr)

    # debug dump: show which parameters remain trainable after freezing
    print('###Start Printing the parameters: ')
    for name, param in trainer.model.named_parameters():
        print(name, param.requires_grad)

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens,
                                                        max_positions)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
        print('Done %d Epochs' % epoch_itr.epoch)

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main():
    """Legacy CLI entry point: parse args, build model, train with the
    multiprocessing trainer, and log per-epoch losses to CSV files.

    Parses its own command line (no parameters), trains until
    ``args.min_lr`` / ``args.max_epoch`` is reached, then stops the
    trainer's worker processes.

    Raises:
        NotImplementedError: if no CUDA device is available.
    """
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int,
                              metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument(
        '--train-subset', default='train', metavar='SPLIT',
        choices=['train', 'valid', 'test'],
        help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument(
        '--valid-subset', default='valid', metavar='SPLIT',
        help='comma separated list of data subsets '
        ' to use for validation (train, valid, valid1,test, test1)')
    dataset_args.add_argument(
        '--max-sentences-valid', type=int, metavar='N',
        help='maximum number of sentences in a validation batch')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    # validation batch size falls back to the training batch size
    if args.max_sentences_valid is None:
        args.max_sentences_valid = args.max_sentences

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset, preferring the binarized format when present
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang,
                                    args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    args.num_gpus = torch.cuda.device_count()

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    print(
        '| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'
        .format(args.num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict,
                                      dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (min(args.max_source_positions,
                               model.max_encoder_positions()),
                           min(args.max_target_positions,
                               model.max_decoder_positions()))
    max_positions_valid = (model.max_encoder_positions(),
                           model.max_decoder_positions())

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Create files to save losses, writing the CSV header row up front
    traincsv_path = os.path.join(args.save_dir, 'train_losses.csv')
    validcsv_path = os.path.join(args.save_dir, 'valid_losses.csv')
    output_path = [traincsv_path, validcsv_path]
    for path in output_path:
        with open(path, 'w+') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',')
            csvwriter.writerow(['Epoch', 'Perplexity', 'Loss'])
            # NOTE(review): redundant — the with-statement already closes
            # the file on exit.
            csvfile.close()

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        # offset 0 means the saved epoch finished; resume from the next one
        if batch_offset == 0:
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset,
              max_positions_train, traincsv_path)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset,
                                max_positions_valid, subset, validcsv_path)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)

        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()
def main(parsed_args):
    """Score ``args.gen_subset`` with a language-model ensemble and report
    average NLL loss and perplexity (optionally per-word probabilities and
    word statistics).

    Args:
        parsed_args: parsed namespace; ``--path`` (colon-separated model
            checkpoints) is required.
    """
    assert parsed_args.path is not None, '--path required for evaluation!'

    import_user_module(parsed_args)

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    # NOTE(review): eval() of --model-overrides executes arbitrary text from
    # the command line — acceptable for a CLI tool, never for untrusted input.
    models, args = utils.load_ensemble_for_inference(
        parsed_args.path.split(':'),
        task,
        model_arg_overrides=eval(parsed_args.model_overrides),
    )

    # Copy CLI args over the checkpoint args, preserving the model's own
    # target/sample-size settings, then rebuild the task with the merged args.
    for arg in vars(parsed_args).keys():
        if arg not in {
                'self_target', 'future_target', 'past_target',
                'tokens_per_sample', 'output_size_dictionary'
        }:
            setattr(args, arg, getattr(parsed_args, arg))
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0

    print('num. model params: {}'.format(
        sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary)

    score_sum = 0.
    count = 0

    # Identify BPE continuation tokens so that sub-word scores can be folded
    # into whole-word scores below.
    if args.remove_bpe is not None:
        bpe_cont = args.remove_bpe.rstrip()
        bpe_toks = set(i for i in range(len(task.dictionary))
                       if task.dictionary[i].endswith(bpe_cont))
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            gen_timer.start()
            hypos = scorer.generate(models, sample)
            gen_timer.stop(sample['ntokens'])

            for hypos_i in hypos:
                hypo = hypos_i[0]
                pos_scores = hypo['positional_scores']

                # Fold each BPE fragment's score into the following token so
                # that word-level scores are complete; fragments are then
                # excluded from the token count.
                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(len(hypo['tokens']) - 1):
                        if hypo['tokens'][i].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                # Drop +/-inf scores rather than poisoning the average.
                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(
                    float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(
                              hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum().cpu()
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs or args.output_word_stats:
                    # Reassemble words from sub-word tokens and accumulate
                    # per-word probabilities/statistics.
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(hypo['tokens'])):
                        w_ind = hypo['tokens'][i].item()
                        w += task.dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))

                            # look ahead for the next non-zero score (zeros
                            # were folded into following tokens above)
                            next_prob = None
                            ind = i + 1
                            while ind < len(hypo['tokens']):
                                if pos_scores[ind].item() != 0:
                                    next_prob = pos_scores[ind]
                                    break
                                ind += 1

                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                                pos_scores[i].item(), next_prob)
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        print('\t'.join('{} [{:2f}]'.format(x[0], x[1])
                                        for x in word_prob))

            wps_meter.update(sample['ntokens'])
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(
        gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(
        avg_nll_loss, np.exp(avg_nll_loss)))

    if args.output_word_stats:
        for ws in sorted(word_stats.values(),
                         key=lambda x: x.count,
                         reverse=True):
            print(ws)
def main(args):
    """Entry point for GPU training with optional online BLEU evaluation.

    Trains until the learning rate drops below ``args.min_lr``, an
    epoch/update budget is exhausted, or (when ``--target-bleu`` is set)
    the target BLEU score is reached. The best-BLEU checkpoint is saved
    whenever online evaluation improves.

    Args:
        args: parsed training namespace (device_id, seed, fp16,
            online_eval/target_bleu, intervals, ...).

    Raises:
        NotImplementedError: if no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        # simple rendezvous: all ranks sync on a broadcast from rank 0
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    # Raise the CUDA device printf FIFO limit (cudaLimitPrintfFifoSize == 0x05)
    # via the raw cudart bindings; return codes are intentionally ignored.
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    torch.cuda.cudart().cudaDeviceSetLimit(ctypes.c_int(0x05),
                                           ctypes.c_int(128))
    torch.cuda.cudart().cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))

    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        # compute capability >= 7.0 (Volta+) has tensor cores
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )
        trainer = Trainer(args, task, model, criterion)

    # BLEU eval detokenizes with the default BPE marker unless one was given
    if (args.online_eval or args.target_bleu) and not args.remove_bpe:
        args.remove_bpe = '@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    max_positions = trainer.get_model().max_positions()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    load_checkpoint(args, trainer, epoch_itr)

    # Send a dummy batch to warm the caching allocator
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    best_bleu = 0.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update and current_bleu < tgt_bleu:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)

        # Eval BLEU score
        # (idiom fix: was "not tgt_bleu is math.inf" — PEP 8 prescribes
        # "x is not y"; behavior unchanged)
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu, current_sc_bleu = score(args, trainer, task,
                                                  epoch_itr, args.gen_subset)
            # keep a separate best-BLEU checkpoint
            if current_bleu > best_bleu:
                best_bleu = current_bleu
                save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # Only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    """Entry point for (optionally distributed) fine-tuning with early stop.

    Freezes ``lm_head`` parameters, trains epoch-by-epoch, and stops early
    when validation loss fails to improve for more than ``args.early_stop``
    consecutive validations (tracked as attributes on
    ``checkpoint_utils.save_checkpoint``).

    Args:
        args: parsed training namespace (max_tokens/max_sentences, seed,
            save_dir, early_stop, intervals, ...).
        init_distributed: when True, initialize the distributed backend and
            record the resulting rank on ``args``.
    """
    utils.import_user_module(args)

    # Best-effort registration of the internal path manager; absence of the
    # module is expected outside that environment.
    try:
        from fairseq.fb_pathmgr import fb_pathmgr
        global fb_pathmgr_registerd
        if not fb_pathmgr_registerd:
            fb_pathmgr.register()
            fb_pathmgr_registerd = True
    except (ModuleNotFoundError, ImportError):
        pass

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Freeze parameters unused for fine-tuning (ad hoc for fine-tuning;
    # should be turned off for BERT pre-training).
    for n, p in model.named_parameters():
        if "lm_head" in n:
            p.requires_grad = False

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # early-stop counter is kept as an attribute on the save_checkpoint
    # function so it survives across calls (alongside its 'best' attribute)
    if not hasattr(checkpoint_utils.save_checkpoint, 'not_best'):
        checkpoint_utils.save_checkpoint.not_best = 0

    while epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update:
        print('Start training')
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
            if args.early_stop > 0:
                # lower loss is better: count consecutive non-improving
                # validations and stop once the patience is exceeded
                if hasattr(
                        checkpoint_utils.save_checkpoint, 'best'
                ) and valid_losses[0] > checkpoint_utils.save_checkpoint.best:
                    checkpoint_utils.save_checkpoint.not_best += 1
                    print("| Not the best ckpt... not best:",
                          checkpoint_utils.save_checkpoint.not_best)
                    if checkpoint_utils.save_checkpoint.not_best > args.early_stop:
                        print("| Early stop...")
                        break
                else:
                    checkpoint_utils.save_checkpoint.not_best = 0
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        # sharded data: get train iterator for next epoch
        reload_dataset = ':' in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Entry point: build task/model/trainer, then train until the LR floor,
    the max epoch, or the max update count is reached.

    Note: this variant requires CUDA and rebuilds the epoch iterator with a
    per-epoch seed so shuffling differs between epochs.
    """
    if args.max_tokens is None:
        args.max_tokens = 10240
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Task setup (translation, language modeling, ...) and data loading.
    task = tasks.setup_task(args)
    load_dataset_splits(task, ['train', 'valid'])

    # Model / criterion construction.
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Dummy batch: warms the caching allocator and serves as a placeholder
    # for DistributedDataParallel when workers see uneven batch counts.
    pos_limits = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    warmup_batch = task.dataset('train').get_dummy_batch(args.max_tokens, pos_limits)

    # Trainer wraps model, criterion and optimizer state.
    trainer = Trainer(args, task, model, criterion, warmup_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # First epoch iterator (subsequent epochs rebuild it below with a new seed).
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=pos_limits,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Optionally initialize from a pretrained BERT checkpoint.
    if hasattr(args, 'load_bert') and args.load_bert:
        load_bert_model(args, trainer)

    # Resume from the latest checkpoint when present; otherwise run one
    # dummy step so lazy state is materialized.
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([warmup_batch])

    # Training loop bounds.
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    cur_lr = trainer.get_lr()
    wall_clock = StopwatchMeter()
    wall_clock.start()
    val_subsets = args.valid_subset.split(',')

    # Initial validation pass before any training.
    val_losses = validate(args, trainer, task, epoch_itr, val_subsets)

    while cur_lr > args.min_lr and epoch_itr.epoch < epoch_limit and trainer.get_num_updates() < update_limit:
        if epoch_itr.epoch > 0:
            # Re-create the iterator with an epoch-dependent seed, carrying
            # over the previous iterator's position/state.
            saved_state = epoch_itr.state_dict()
            epoch_itr = task.get_batch_iterator(
                dataset=task.dataset(args.train_subset),
                max_tokens=args.max_tokens,
                max_sentences=args.max_sentences,
                max_positions=pos_limits,
                ignore_invalid_inputs=True,
                required_batch_size_multiple=8,
                seed=args.seed + epoch_itr.epoch,
                num_shards=args.distributed_world_size,
                shard_id=args.distributed_rank,
            )
            epoch_itr.load_state_dict(saved_state)

        # One epoch of training.
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            val_losses = validate(args, trainer, task, epoch_itr, val_subsets)

        # Only the first validation loss drives the LR scheduler.
        cur_lr = trainer.lr_step(epoch_itr.epoch, val_losses[0])

        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, val_losses[0])

    wall_clock.stop()
    print('| done training in {:.1f} seconds'.format(wall_clock.sum))
def main(args, init_distributed=False):
    """Entry point: set up task/model/trainer (optionally distributed) and
    run the epoch loop until the LR floor, max epoch, or max update.

    This variant also builds a one-sentence "OOM batch" used by the trainer
    to recover from out-of-memory errors.
    """
    import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Task setup and dataset splits.
    task = tasks.setup_task(args)
    load_dataset_splits(task, ['train', 'valid'])

    # Distributed init happens after data loading so every rank sees the data.
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(socket.gethostname(),
                                                        args.distributed_rank))

    # Model / criterion.
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Dummy batches: allocator warm-up / DDP placeholder, and a minimal
    # batch for OOM recovery.
    pos_limits = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    warmup_batch = task.dataset('train').get_dummy_batch(args.max_tokens, pos_limits)
    tiny_batch = task.dataset('train').get_dummy_batch(1, pos_limits)

    trainer = Trainer(args, task, model, criterion, warmup_batch, tiny_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Training dataloader.
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=pos_limits,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Resume from the latest checkpoint; if there is none, run one dummy
    # step to materialize lazy state.
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([warmup_batch])

    # Loop bounds and meters.
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    cur_lr = trainer.get_lr()
    wall_clock = StopwatchMeter()
    wall_clock.start()
    val_losses = [None]
    val_subsets = args.valid_subset.split(',')

    while cur_lr > args.min_lr and epoch_itr.epoch < epoch_limit and trainer.get_num_updates() < update_limit:
        # One epoch of training.
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            val_losses = validate(args, trainer, task, epoch_itr, val_subsets)

        # Only the first validation loss drives the LR scheduler.
        cur_lr = trainer.lr_step(epoch_itr.epoch, val_losses[0])

        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, val_losses[0])

    wall_clock.stop()
    print('| done training in {:.1f} seconds'.format(wall_clock.sum))
def main(args, init_distributed=False):
    """Entry point: validate batch-size flags, set up distributed training,
    build the trainer, and run the epoch loop with patience-based early
    stopping and optional sharded-data iterator reloads.
    """
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # CUDA / RNG / distributed setup.
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    # Only the master rank touches the checkpoint directory.
    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)

    # Task setup; validation splits are loaded now, training data is loaded
    # later based on the latest checkpoint.
    task = tasks.setup_task(args)
    for split_name in args.valid_subset.split(','):
        task.load_dataset(split_name, combine=False, epoch=0)

    # Model / criterion.
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Restore the latest checkpoint (if any) and the matching train iterator.
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Loop bounds and meters.
    epoch_limit = args.max_epoch or math.inf
    update_limit = args.max_update or math.inf
    cur_lr = trainer.get_lr()
    wall_clock = StopwatchMeter()
    wall_clock.start()
    val_subsets = args.valid_subset.split(',')

    # Dump the full args namespace next to the tensorboard logs for later
    # inspection of the run configuration.
    if len(args.tensorboard_logdir) > 0:
        os.makedirs(args.tensorboard_logdir, exist_ok=True)
        with open(os.path.join(args.tensorboard_logdir, 'args_log.txt'), 'w') as f:
            pprint.pprint(args.__dict__, stream=f)

    while (cur_lr > args.min_lr
           # allow resuming training from the final checkpoint
           and (epoch_itr.epoch < epoch_limit or epoch_itr._next_epoch_itr is not None)
           and trainer.get_num_updates() < update_limit):
        # One epoch of training.
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            val_losses = validate(args, trainer, task, epoch_itr, val_subsets)
        else:
            val_losses = [None]

        # Only the first validation loss drives the LR scheduler.
        cur_lr = trainer.lr_step(epoch_itr.epoch, val_losses[0])

        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, val_losses[0])

        # Patience-based early stopping.
        if should_stop_early(args, val_losses[0]):
            print(
                '| Early stop since valid performance hasn\'t improved for last {} runs'
                .format(args.patience))
            break

        # Sharded data (paths joined with ':'): reload the dataset for the
        # next epoch's iterator.
        reload_dataset = ':' in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset)

    wall_clock.stop()
    print('| done training in {:.1f} seconds'.format(wall_clock.sum))
def train(
    args,
    extra_state: Dict[str, Any],
    trainer,
    task,
    epoch_itr,
    output_queue: Optional[mp_queues.Queue] = None,
    **train_step_kwargs,
):
    """Main training loop: iterate epochs until the LR floor or max epoch,
    running periodic save/eval, optional pruning, optional adversarial
    augmentation, and mid-epoch LR decay on stalled BLEU.

    ``extra_state`` carries resumable bookkeeping (epoch, batch_offset,
    num_iterations, tune_eval/tune_bleu results); ``output_queue``, when
    given on the master rank, receives (num_updates, stats-dict) tuples.
    """
    # Offset for the current epoch (may differ from the checkpoint offset).
    starting_offset = extra_state["batch_offset"]

    epoch_limit = args.max_epoch or math.inf
    lr = trainer.get_lr()
    wall_clock = StopwatchMeter()
    wall_clock.start()
    stop_mid_epoch = False
    stop_end_of_epoch = False

    # Optional magnitude pruning: build masks once, re-apply after each step.
    do_prune = args.pruning_percentile > 0
    if do_prune:
        prune_masks = create_prune_masks(args, trainer)
        apply_prune_masks(prune_masks, trainer)

    while lr > args.min_lr and extra_state["epoch"] <= epoch_limit:
        # --- one epoch ---
        itr, progress, extra_meters = setup_epoch(
            args=args, epoch_itr=epoch_itr, trainer=trainer
        )

        for batch_idx, samples in enumerate(progress, start=starting_offset):
            clear_per_step_extra_state(extra_state)
            extra_state["num_iterations"] = extra_state.get("num_iterations", 0) + 1

            # Enable adversarial augmentation only after warm-up.
            if (train_step_kwargs is not None
                    and "augment_adv" in train_step_kwargs.keys()):
                train_step_kwargs["augment_adv"] = (
                    extra_state["num_iterations"] > args.warmup_steps
                )

            try:
                log_output = trainer.train_step(samples, **train_step_kwargs)
            # Fairseq's fp16_trainer raises this uncommon error to indicate
            # that we should stop training.
            except FloatingPointError as e:
                print(f"Stopping training due to: {e}.")
                stop_mid_epoch = True
                break

            if do_prune:
                apply_prune_masks(prune_masks, trainer)

            if batch_idx == starting_offset:
                # Ignore the first mini-batch in words-per-second calculation.
                trainer.get_meter("wps").reset()

            # Clear any remaining metrics from previous steps. This should
            # already have been done before, but just in case - to make sure
            # we catch any case where extra_state does not get populated
            # correctly.
            extra_state = clear_per_step_extra_state(extra_state)
            extra_state["batch_offset"] = batch_idx + 1

            (
                extra_state,
                stop_mid_epoch,
                translation_samples,
            ) = evals.save_and_eval(
                args=args, trainer=trainer, task=task, extra_state=extra_state
            )

            # This must come after save_and_eval: even when log_output is None
            # (overflow), save_and_eval still has to sync all_reduce before
            # the batch is skipped.
            if log_output is None:
                continue

            train_stats = evals.log_mid_epoch_stats(
                trainer=trainer,
                progress=progress,
                extra_meters=extra_meters,
                log_output=log_output,
            )

            if distributed_utils.is_master(args) and output_queue is not None:
                output_queue.put_nowait((
                    trainer.get_num_updates(),
                    {
                        "train_ppl": train_stats["ppl"],
                        "tune_ppl": extra_state["tune_eval"]["perplexity"],
                        "tune_bleu": extra_state["tune_bleu"]["current"],
                        # We only report wps at the end of an epoch, since
                        # the meter gets reset at the start of every epoch.
                        "wps": None,
                        "translation_samples": translation_samples,
                    },
                ))

            # Shrink the LR mid-epoch when BLEU has not improved for too long.
            if (args.save_interval_updates > 0
                    and extra_state["num_iterations"] % args.save_interval_updates == 0
                    and args.shrink_lr_no_best_bleu_eval > 0
                    and extra_state["tune_bleu"]["num_since_best"]
                    > args.shrink_lr_no_best_bleu_eval):
                current_lr = trainer.optimizer.get_lr()
                trainer.optimizer.set_lr(current_lr * args.lr_shrink)
                lr = trainer.optimizer.get_lr()
                print(f"Decayed lr from {current_lr} to {lr}.")

            stop_mid_epoch = (
                stop_mid_epoch
                or is_training_over_time_limit(extra_state["start_time"],
                                               args.stop_time_hr)
            )
            if stop_mid_epoch:
                break

        # --- end of epoch ---
        train_stats = evals.log_end_epoch_stats(
            trainer=trainer, progress=progress, extra_meters=extra_meters
        )

        # batch_offset being None denotes the end of an epoch.
        extra_state["batch_offset"] = None
        (
            extra_state,
            stop_end_of_epoch,
            translation_samples,
        ) = evals.save_and_eval(
            args=args,
            trainer=trainer,
            task=task,
            extra_state=extra_state,
            end_of_epoch=True,
        )

        if distributed_utils.is_master(args) and output_queue is not None:
            output_queue.put_nowait((
                trainer.get_num_updates(),
                {
                    "train_ppl": train_stats["ppl"],
                    "tune_ppl": extra_state["tune_eval"]["perplexity"],
                    "tune_bleu": extra_state["tune_bleu"]["current"],
                    "wps": train_stats["wps"],
                    "translation_samples": translation_samples,
                },
            ))

        if stop_mid_epoch or stop_end_of_epoch:
            break

        # Advance the schedule and reset per-epoch offsets.
        lr = trainer.lr_step(extra_state["epoch"], extra_state["tune_eval"]["loss"])
        extra_state["epoch"] += 1
        extra_state["batch_offset"] = 0
        starting_offset = 0

    wall_clock.stop()
    print(f"| done training in {wall_clock.sum:.1f} seconds")
    print(f"| Best BLEU score of {extra_state['tune_bleu']['best']} was from "
          f"epoch {extra_state['tune_bleu']['best_epoch']}")
def main(args, init_distributed=False):
    """Entry point: set up the task, resolve special-symbol IDs from the
    source dictionary, build the trainer, log the run configuration, and
    run the epoch loop.

    Bug fixes vs. the original:
    - when '</s>' was missing from the dictionary, the fallback assigned
      ``args.EOD_ID`` (typo) instead of ``args.EOS_ID``, leaving EOS_ID
      undefined for later readers;
    - the params log file was opened without a context manager.
    """
    import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(args, task)

    # Resolve [APPEND] / [SRC] / [TGT] / [SEP] / EOS / PAD symbol ids from
    # the source dictionary; -1 marks a symbol that is absent.
    args.unk_idx = task.src_dict.indices['<unk>']
    args.dict_len = len(task.src_dict.indices)
    for sym, attr in (('[APPEND]', 'APPEND_ID'), ('[SRC]', 'SRC_ID'),
                      ('[TGT]', 'TGT_ID'), ('[SEP]', 'SEP_ID')):
        if sym in task.src_dict.indices:
            setattr(args, attr, task.src_dict.indices[sym])
            print("{} ID: {}".format(sym, getattr(args, attr)))
        else:
            setattr(args, attr, -1)
    if '</s>' in task.src_dict.indices:
        args.EOS_ID = task.src_dict.indices['</s>']
    else:
        # FIX: was "args.EOD_ID = -1" (typo), which left EOS_ID undefined.
        args.EOS_ID = -1
    if '<pad>' in task.src_dict.indices:
        args.PAD_ID = task.src_dict.indices['<pad>']
    else:
        args.PAD_ID = -1

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Make a dummy batch to (i) warm the caching allocator and (ii) as a
    # placeholder DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset(args.train_subset).get_dummy_batch(
        args.max_tokens, max_positions)
    oom_batch = task.dataset(args.train_subset).get_dummy_batch(1, max_positions)

    # Build trainer
    print("Building trainer...")
    trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    print("Initialize dataloader...")
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Initialize distributed training (after data loading)
    print("Initialize distributed training (after data loading)...")
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(
            socket.gethostname(), args.distributed_rank))

    model.args = args

    # Load the latest checkpoint if one is available
    print("Load the latest checkpoint if one is available...")
    load_checkpoint(args, trainer, epoch_itr)
    # trainer.dummy_train_step([dummy_batch])
    if args.reset_target_embedding:
        trainer.init_meters(args)
        print("reset trainer.meters")

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    # Rank 0 dumps args and the model description to a params log file.
    if args.distributed_rank == 0:
        if os.path.basename(args.save_dir) != "":
            log_file = os.path.join(
                args.save_dir,
                "({0})-params.log".format(os.path.basename(args.save_dir)))
        else:
            log_file = os.path.join(
                args.save_dir,
                "({0})-params.log".format(args.save_dir.split('/')[-2]))
        # create log file
        args.log_file = log_file
        if os.path.exists(log_file):
            print("It exists log file {}, add log to the file".format(log_file))
            mode = "a+"
        else:
            print("It does not exists log file {}, create log file".format(log_file))
            mode = "w"
        # FIX: use a context manager so the handle is closed on any error.
        with open(log_file, mode, encoding="utf-8") as w:
            w.write(str(args).replace(", ", ",\n") + "\n")
            w.write(str(model) + "\n")
            w.flush()
        print("saving params file into{}...".format(log_file))

    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Generate hypotheses for a dataset split with a (possibly ensembled)
    model, print S/T/H/A lines, and score against the reference.

    Bug fixes vs. the original:
    - the n-best slice used ``min(len(hypos), args.nbest)`` — the *batch*
      size — instead of limiting by ``args.nbest`` per sentence;
    - the inner hypothesis loop shadowed the sentence index ``i``; it now
      uses ``j``, so the top-hypothesis check is unambiguous.

    Returns the scorer object (BLEU / sacrebleu).
    """
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    # SECURITY NOTE: eval() on --model-overrides executes arbitrary code;
    # only pass trusted values for this flag.
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            # Optional teacher-forced prefix for constrained decoding.
            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions.
                # FIX: slice this sentence's hypotheses by args.nbest; the
                # original used min(len(hypos), args.nbest), i.e. the batch
                # size, which could truncate the n-best list. Use `j` so the
                # sentence index `i` is not shadowed.
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str))
                        #print('P-{}\t{}'.format(
                        #    sample_id,
                        #    ' '.join(map(
                        #        lambda x: '{:.4f}'.format(x),
                        #        hypo['positional_scores'].tolist(),
                        #    ))
                        #))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    return scorer
def main(args):
    """Decode a speech-recognition dataset split with an (optionally
    LM-fused) model ensemble, print hypotheses, and write WER/CER and
    aligned results under ``args.results_path``.

    Fix vs. the original: the target dictionary local was named ``dict``,
    shadowing the builtin; renamed to ``tgt_dict`` (purely local).

    Returns the WER scorer object.
    """
    assert args.path is not None, '--path required for recognition!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionary (FIX: do not shadow the builtin `dict`)
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    # SECURITY NOTE: eval() on --model-overrides executes arbitrary code;
    # only pass trusted values for this flag.
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Fuse language models into the ensemble.
    # NOTE(review): this mutates `models` (del) while enumerating it, which
    # skips the element after a deletion; preserved as-is since the expected
    # configurations place the word LM last.
    for i, m in enumerate(models):
        if hasattr(m, 'is_wordlm') and m.is_wordlm:
            # assume subword LM comes before word LM
            if isinstance(models[i - 1], FairseqLanguageModel):
                models[i-1] = MultiLevelLanguageModel(m, models[i-1],
                    subwordlm_weight=args.subwordlm_weight,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab)
                del models[i]
                print('| LM fusion with Multi-level LM')
            else:
                models[i] = LookAheadWordLanguageModel(m, tgt_dict,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab)
                print('| LM fusion with Look-ahead Word LM')
        # assume subword LM comes after E2E models
        elif i == len(models) - 1 and isinstance(m, FairseqLanguageModel):
            print('| LM fusion with Subword LM')
    if args.lm_weight != 0.0:
        print('| using LM fusion with lm-weight={:.2f}'.format(args.lm_weight))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment or args.coverage_weight > 0.,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() if hasattr(model, 'encoder')
              else (None, model.max_positions()) for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    if args.match_source_len:
        print('| The option match_source_len is not applicable to '
              'speech recognition. Ignoring it.')
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute WER
    scorer = wer.Scorer(tgt_dict, wer_output_filter=args.wer_output_filter)
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens,
                                        lm_weight=args.lm_weight)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None
                utt_id = sample['utt_id'][i]

                # Retrieve the original sentences
                if has_target:
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_tokens(sample_id)
                    if not args.quiet:
                        target_sent = tgt_dict.tokens_to_sentence(
                            target_str, use_unk_sym=False,
                            bpe_symbol=args.remove_bpe)
                        print('T-{}\t{}'.format(utt_id, target_sent))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    # not removing bpe at this point
                    hypo_str = tgt_dict.string(hypo['tokens'].int().cpu())
                    if not args.quiet or i == 0:
                        hypo_sent = tgt_dict.tokens_to_sentence(
                            hypo_str, bpe_symbol=args.remove_bpe)
                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(utt_id, hypo_sent,
                                                    hypo['score']))

                    # Score and obtain attention only the top hypothesis
                    if j == 0:
                        # src_len x tgt_len
                        attention = hypo['attention'].float().cpu() \
                            if hypo['attention'] is not None else None
                        if args.print_alignment and attention is not None:
                            save_dir = os.path.join(args.results_path,
                                                    'attn_plots')
                            os.makedirs(save_dir, exist_ok=True)
                            plot_attention(attention, hypo_sent, utt_id,
                                           save_dir)
                        scorer.add_prediction(utt_id, hypo_str,
                                              bpe_symbol=args.remove_bpe)
                        if has_target:
                            scorer.add_evaluation(utt_id, target_str,
                                                  hypo_str,
                                                  bpe_symbol=args.remove_bpe)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print('| Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum,
        num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.print_alignment:
        print('| Saved attention plots in ' + save_dir)

    if has_target:
        assert args.test_text_files is not None
        scorer.add_ordered_utt_list(*args.test_text_files)

    os.makedirs(args.results_path, exist_ok=True)

    fn = 'decoded_char_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_char_results())
        print('| Decoded char results saved as ' + f.name)
    fn = 'decoded_results.txt'
    with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
        f.write(scorer.print_results())
        print('| Decoded results saved as ' + f.name)

    if has_target:
        header = ' Recognize {} with beam={}: '.format(args.gen_subset,
                                                       args.beam)
        fn = 'wer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(
                *(scorer.wer()))
            print('|' + header + res)
            f.write(res + '\n')
            print('| WER saved in ' + f.name)
        fn = 'cer'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            res = 'CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%'.format(
                *(scorer.cer()))
            print('|' + ' ' * len(header) + res)
            f.write(res + '\n')
            print('| CER saved in ' + f.name)
        fn = 'aligned_results.txt'
        with open(os.path.join(args.results_path, fn), 'w', encoding='utf-8') as f:
            f.write(scorer.print_aligned_results())
            print('| Aligned results saved as ' + f.name)
    return scorer
def main(args, init_distributed=False):
    """Train a model until the LR decays below ``args.min_lr`` or the
    epoch/update budget is exhausted.

    Args:
        args: parsed command-line namespace (task, model, optimization,
            checkpointing and distributed options).
        init_distributed (bool): when True, initialize the distributed
            process group via ``distributed_utils.distributed_init`` and
            record the resulting rank on ``args``.
    """
    utils.import_user_module(args)

    # A batch-size bound is mandatory; either tokens or sentences works.
    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training.
    # NOTE: the device must be selected and the RNG seeded *before*
    # distributed init and model construction so every rank is consistent.
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, epoch=0, combine=False)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    # print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            # No validation this epoch: the LR scheduler receives None below.
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch=epoch_itr.epoch)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    """Evaluate an ensemble of encoder models by nearest-neighbour retrieval.

    For every batch, the ensemble-averaged source/target representations are
    compared with an L1 ``torch.cdist`` matrix; top-k retrieval accuracy is
    printed (optionally blended with MUSE word-embedding distances, sweeping
    the blend coefficient beta in [0, 1]).

    Fixes over the previous revision:
      * ``fo``/``fl`` file handles are now always closed (try/finally) —
        previously they leaked every batch.
      * In the no-embedding branch, ``dist`` is no longer transposed
        in place inside the ``for k in args.k`` loop; with ``--reverse``
        and multiple k values the matrix used to flip back and forth,
        so every second k was evaluated un-reversed.

    NOTE(review): the output/log files are (re)opened with mode 'w' for each
    batch, so with more than one batch earlier results are truncated —
    presumably the dataset fits in a single batch here; confirm with callers.
    """
    assert args.path is not None, '--path required for generation!'
    utils.import_user_module(args)
    print(args)
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset, combine=False, epoch=0)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),  # NOTE: eval of a CLI string; trusted input only
        task=task)

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = data_utils.get_epoch_iterator(
        task,
        task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=None,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        num_workers=args.num_workers,
        seed=args.seed).next_epoch_itr(shuffle=False)

    # Initialize gen_timer
    gen_timer = StopwatchMeter()

    with progress_bar.build_progress_bar(args, itr) as t:
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            # Ensemble-average the source/target encoder outputs.
            src_out, tgt_out = [], []
            gen_timer.start()
            for model in models:
                model.eval()
                with torch.no_grad():
                    s_out, t_out = model.forward(**sample['net_input'])
                src_out.append(s_out)
                tgt_out.append(t_out)
            gen_timer.stop()
            src_out = sum(src_out) / len(src_out)
            tgt_out = sum(tgt_out) / len(tgt_out)

            valid_idx = sample['valid_split']['valid_indices']
            # Pairwise L1 distance between all valid source/target vectors.
            dist = torch.cdist(src_out.detach()[valid_idx],
                               tgt_out.detach()[valid_idx], p=1)

            fo, fl = None, None
            if args.output_file:
                fo = open(args.output_file, 'w', encoding='utf-8')
                fo.write("k acc\n")
            if args.log_file:
                # NOTE(review): fl is opened but never written in this block;
                # kept for side effect (file creation) — confirm it is needed.
                fl = open(args.log_file, 'w', encoding='utf-8')
            try:
                # Load MUSE embeddings if available
                src_embed, tgt_embed = None, None
                if args.source_embed_path and args.target_embed_path:
                    # First line of a MUSE text embedding file is "<count> <dim>".
                    with open(args.source_embed_path, 'r') as f:
                        first_line = f.readline()
                        embed_dim = int(first_line.rstrip().split()[1])
                    src_d = task.source_dictionary
                    src_symbol = [
                        "".join(
                            src_d.string(
                                utils.strip_pad(
                                    sample['net_input']['src_tokens'][i, :],
                                    src_d.pad()),
                                args.remove_bpe).split())
                        for i in range(sample['nsentences'])
                    ]
                    tgt_d = task.target_dictionary
                    tgt_symbol = [
                        "".join(
                            tgt_d.string(
                                utils.strip_pad(
                                    sample['net_input']['tgt_tokens'][i, :],
                                    tgt_d.pad()),
                                args.remove_bpe).split())
                        for i in range(sample['nsentences'])
                    ]

                    def build_dict(symbol_list):
                        # One-symbol-per-sentence dictionary for embedding lookup.
                        d = Dictionary()
                        for symbol in symbol_list:
                            d.add_symbol(symbol)
                        return d

                    src_dict = build_dict(src_symbol)
                    tgt_dict = build_dict(tgt_symbol)
                    src_embed = data_utils.build_embedding(
                        src_dict, embed_dim, path=args.source_embed_path)
                    tgt_embed = data_utils.build_embedding(
                        tgt_dict, embed_dim, path=args.target_embed_path)
                    src_sem_out = [
                        src_embed(torch.tensor(src_dict.index(s))).unsqueeze(0)
                        for s in src_symbol
                    ]
                    src_sem_out = torch.cat(src_sem_out, dim=0).to(dist.device)
                    tgt_sem_out = [
                        tgt_embed(torch.tensor(tgt_dict.index(s))).unsqueeze(0)
                        for s in tgt_symbol
                    ]
                    tgt_sem_out = torch.cat(tgt_sem_out, dim=0).to(dist.device)
                    # Semantic (word-embedding) distance matrix.
                    sdist = torch.cdist(src_sem_out[valid_idx],
                                        tgt_sem_out[valid_idx], p=1)
                    dim = src_out.size(1)
                    for k in args.k:
                        best_acc = -1
                        best_b = -1
                        # Sweep the blend coefficient beta over [0, 1] in 1e-4 steps.
                        for b in [1e-4 * t for t in range(10001)]:
                            # Normalize each distance by its dimensionality before blending.
                            fdist = dist / dim * b + sdist / embed_dim * (1 - b)
                            if args.reverse:
                                fdist = fdist.t()
                            indices = fdist.topk(k=k, dim=1, largest=False,
                                                 sorted=True).indices
                            # Ground truth: row i should retrieve column i.
                            correct = torch.arange(len(valid_idx)).to(
                                indices.device)
                            ncorrect = torch.sum(
                                torch.any(indices == correct.unsqueeze(1),
                                          dim=1)).cpu().numpy()
                            accuracy = ncorrect / len(valid_idx)
                            if accuracy > best_acc:
                                best_acc = accuracy
                                best_b = b
                        print('| Accuracy at k={:d}, beta={:.2f} : {:.5f}'.format(
                            k, best_b, best_acc))
                else:
                    for k in args.k:
                        # BUGFIX: use a transposed *view* per iteration instead of
                        # reassigning `dist`, which used to flip the matrix back
                        # and forth across successive k values.
                        kdist = dist.t() if args.reverse else dist
                        indices = kdist.topk(k=k, dim=1, largest=False,
                                             sorted=True).indices
                        correct = torch.arange(len(valid_idx)).to(indices.device)
                        ncorrect = torch.sum(
                            torch.any(indices == correct.unsqueeze(1),
                                      dim=1)).detach().cpu().numpy()
                        accuracy = ncorrect / len(valid_idx)
                        if fo:
                            fo.write("{:d} {:.6f}\n".format(k, accuracy))
                        print('| Accuracy at k={}: {:.5f}'.format(k, accuracy))
            finally:
                # BUGFIX: previously these handles were never closed (leak).
                if fo:
                    fo.close()
                if fl:
                    fl.close()
def main(args):
    """Train a model on GPU until the LR decays below ``args.min_lr`` or the
    epoch/update budget is exhausted (single-machine, sharded-iterator variant).

    Args:
        args: parsed command-line namespace. ``args.max_tokens`` defaults to
            6000 when unset.

    Raises:
        NotImplementedError: when CUDA is unavailable (CPU training is
            unsupported here).
    """
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) as a
    # placeholder DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader; batches are sharded across distributed workers.
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available; on a fresh run, do a
    # dummy step to initialize the trainer state.
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)
        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main():
    """Legacy training entry point: parses its own CLI arguments, loads the
    dataset, and trains with a ``MultiprocessingTrainer`` (one process per GPU)
    until the LR decays below ``args.min_lr`` or ``args.max_epoch`` is reached.

    Raises:
        NotImplementedError: when CUDA is unavailable.
    """
    parser = options.get_parser('Trainer')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
                              help='maximum number of tokens in a batch')
    dataset_args.add_argument('--max-sentences', type=int, metavar='N',
                              help='maximum number of sentences in a batch')
    dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
                              choices=['train', 'valid', 'test'],
                              help='data subset to use for training (train, valid, test)')
    dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                              help='comma separated list of data subsets '
                                   ' to use for validation (train, valid, valid1,test, test1)')
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_model_args(parser)

    args = utils.parse_args_and_arch(parser)

    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'simple'

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    torch.manual_seed(args.seed)

    # Load dataset: prefer binarized data, fall back to raw text.
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(args)
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    num_gpus = torch.cuda.device_count()

    print('| using {} GPUs (with max tokens per GPU = {} and max sentences per GPU = {})'.format(
        num_gpus, args.max_tokens, args.max_sentences))

    # Build model and criterion
    model = utils.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = utils.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))

    # The max number of positions can be different for train and valid
    # e.g., RNNs may support more positions at test time than seen in training
    max_positions_train = (args.max_source_positions, args.max_target_positions)
    max_positions_valid = (
        min(args.max_source_positions, model.max_encoder_positions()),
        min(args.max_target_positions, model.max_decoder_positions())
    )

    # Start multiprocessing
    trainer = MultiprocessingTrainer(args, model, criterion)

    # Load the latest checkpoint if one is available
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            # The checkpointed epoch finished cleanly; resume at the next one.
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    val_loss = None
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, epoch, batch_offset, trainer, dataset, max_positions_train, num_gpus)

        # evaluate on validate set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, epoch, trainer, dataset, max_positions_valid, subset, num_gpus)
            if k == 0:
                if not args.no_save:
                    # save checkpoint
                    # NOTE(review): argument order differs from the
                    # save_checkpoint(args, trainer, epoch_itr, val_loss)
                    # defined earlier in this file — presumably this legacy
                    # entry point targets a different save_checkpoint; verify.
                    save_checkpoint(trainer, args, epoch, 0, val_loss)
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(val_loss, epoch)
        epoch += 1
        batch_offset = 0
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

    # Stop multiprocessing
    trainer.stop()