def validate(args, trainer, task, epoch_itr, subsets):
    """Evaluate the model on the validation set(s) and return the losses."""
    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple'
        )

        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()
        extra_meters = collections.defaultdict(lambda: AverageMeter())

        for sample in progress:
            log_output = trainer.valid_step(sample)

            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                    continue
                extra_meters[k].update(v)

        # log validation stats
        stats = get_valid_stats(trainer)
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        progress.print(stats)

        valid_losses.append(stats['valid_loss'])
    return valid_losses

def get_dummy_batch(self, num_tokens, max_positions, src_len=128, tgt_len=128):
    """Return a dummy batch with a given number of tokens."""
    src_len, tgt_len = utils.resolve_max_positions(
        (src_len, tgt_len),
        max_positions,
        (self.max_source_positions, self.max_target_positions),
    )
    bsz = num_tokens // max(src_len, tgt_len)
    return self.collater([
        {
            'id': i,
            'source': self.src_dict.dummy_sentence(src_len),
            'target': self.tgt_dict.dummy_sentence(tgt_len) if self.tgt_dict is not None else None,
        }
        for i in range(bsz)
    ])

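# Usage sketch (hypothetical, but mirroring the training entry point further
# below): the dummy batch is used to warm the caching allocator and as a
# placeholder batch for DistributedDataParallel when workers see an uneven
# number of batches. `task`, `model`, `args`, and `trainer` are assumed to be
# set up as in that entry point.
max_positions = utils.resolve_max_positions(
    task.max_positions(),
    model.max_positions(),
)
dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
trainer.dummy_train_step([dummy_batch])
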
def _generate_score(models, args, task, dataset, modify_target_dict):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print(
            "| loading model(s) from {}".format(
                ", ".join(args.path.split(CHECKPOINT_PATHS_DELIMITER))
            )
        )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is " + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate, args.max_examples_to_evaluate_seed
        )

    # Keep track of translations; initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args)
            else 0,
        )

        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer, modify_target_dict
        ):
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens, trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[trans_info.sample_id] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving
                hypos = [
                    {k: v for k, v in hypo.items() if k in ["tokens", "score"]}
                    for hypo in trans_info.hypos
                ]
                # Make sure everything is on cpu before exporting
                hypos = [
                    {"score": hypo["score"], "tokens": hypo["tokens"].cpu()}
                    for hypo in hypos
                ]
                translation_info_list.append(
                    {
                        "src_tokens": trans_info.src_tokens.cpu(),
                        "target_tokens": trans_info.target_tokens,
                        "hypos": hypos,
                    }
                )
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        with open(args.translation_info_export_path, "wb") as f:
            pickle.dump(translation_info_list, f)

    # If applicable, save the translations and scores to the output files
    # These two outputs are used in dual learning for weighted backtranslation
    if getattr(args, "translation_output_file", False) and getattr(
        args, "translation_probs_file", False
    ):
        with open(args.translation_output_file, "w") as translation_file, open(
            args.translation_probs_file, "w"
        ) as score_file:
            for hypo_str, hypo_score in zip(translated_sentences, translated_scores):
                if len(hypo_str.strip()) > 0:
                    print(hypo_str, file=translation_file)
                    print(np.exp(hypo_score), file=score_file)

    # For e.g. external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(hypo["tokens"], bpe_symbol=args.remove_bpe),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples

from fairseq.criterions import LabelSmoothedLengthGan_CrossEntropyCriterion
from fairseq.optim import FairseqAdam
from fairseq.optim.lr_scheduler import InverseSquareRootSchedule

args.use_gpu = False

translation_self = TranslationSelfDataset.load_dictionary(args)
valid_dataset = translation_self.load_dataset("valid")
# train_dataset = translation_self.load_dataset("train")

valid_dataloader = get_batch_iterator(
    valid_dataset,
    input_shapes=args.input_shapes,
    max_tokens=args.max_tokens,
    max_positions=utils.resolve_max_positions(
        translation_self.max_positions(),
        (args.max_source_positions, args.max_target_positions)))
# train_dataloader = get_batch_iterator(
#     train_dataset,
#     input_shapes=args.input_shapes,
#     max_tokens=args.max_tokens,
#     max_positions=utils.resolve_max_positions(
#         translation_self.max_positions(),
#         (args.max_source_positions, args.max_target_positions)))

_MODEL = None


def create_model(args):
    global _MODEL
    _MODEL = Transformer_nonautoregressive_gan.build_model(
def main(parsed_args, **unused_kwargs):
    assert parsed_args.path is not None, '--path required for evaluation!'

    if torch.cuda.is_available() and not parsed_args.cpu:
        torch.cuda.set_device(parsed_args.device_id)

    utils.import_user_module(parsed_args)

    logger.info(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(parsed_args.path))
    models, args = checkpoint_utils.load_model_ensemble(
        parsed_args.path.split(os.pathsep),
        arg_overrides=eval(parsed_args.model_overrides),
        task=task,
        suffix=getattr(parsed_args, "checkpoint_suffix", ""),
    )

    for arg in vars(parsed_args).keys():
        if arg not in {
            'self_target', 'future_target', 'past_target',
            'tokens_per_sample', 'output_size_dictionary', 'add_bos_token',
        }:
            setattr(args, arg, getattr(parsed_args, arg))

    # reduce tokens per sample by the required context window size
    args.tokens_per_sample -= args.context_window
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    dataset = task.dataset(args.gen_subset)
    if args.context_window > 0:
        dataset = LMContextWindowDataset(
            dataset=dataset,
            tokens_per_sample=args.tokens_per_sample,
            context_window=args.context_window,
            pad_idx=task.source_dictionary.pad(),
        )
    logger.info('{} {} {} examples'.format(args.data, args.gen_subset, len(dataset)))

    # Optimize ensemble for generation and set the source and dest dicts
    # on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0

    logger.info('num. model params: {}'.format(
        sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary, args.softmax_batch)

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        if args.remove_bpe == 'sentencepiece':
            raise NotImplementedError
        else:
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = {
                i
                for i in range(len(task.source_dictionary))
                if task.source_dictionary[i].endswith(bpe_cont)
            }
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    wps_meter = TimeMeter()

    for sample in progress:
        if 'net_input' not in sample:
            continue

        sample = utils.move_to_cuda(sample) if use_cuda else sample

        gen_timer.start()
        hypos = scorer.generate(models, sample)
        gen_timer.stop(sample['ntokens'])

        for i, hypos_i in enumerate(hypos):
            hypo = hypos_i[0]
            sample_id = sample['id'][i]

            tokens = hypo['tokens']
            tgt_len = tokens.numel()
            pos_scores = hypo['positional_scores'].float()

            if args.add_bos_token:
                assert hypo['tokens'][0].item() == task.target_dictionary.bos()
                tokens = tokens[1:]
                pos_scores = pos_scores[1:]

            skipped_toks = 0
            if bpe_toks is not None:
                for i in range(tgt_len - 1):
                    if tokens[i].item() in bpe_toks:
                        skipped_toks += 1
                        pos_scores[i + 1] += pos_scores[i]
                        pos_scores[i] = 0

            inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
            if inf_scores.any():
                # logger.info takes a format string, not print-style varargs
                logger.info(
                    'skipping tokens with inf scores: %s',
                    task.target_dictionary.string(tokens[inf_scores.nonzero()]))
                pos_scores = pos_scores[(~inf_scores).nonzero()]
            score_sum += pos_scores.sum().cpu()
            count += pos_scores.numel() - skipped_toks

            if args.output_word_probs or args.output_word_stats:
                w = ''
                word_prob = []
                is_bpe = False
                for i in range(len(tokens)):
                    w_ind = tokens[i].item()
                    w += task.source_dictionary[w_ind]
                    if bpe_toks is not None and w_ind in bpe_toks:
                        w = w[:-bpe_len]
                        is_bpe = True
                    else:
                        word_prob.append((w, pos_scores[i].item()))

                        next_prob = None
                        ind = i + 1
                        while ind < len(tokens):
                            if pos_scores[ind].item() != 0:
                                next_prob = pos_scores[ind]
                                break
                            ind += 1

                        word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                            pos_scores[i].item(), next_prob)
                        is_bpe = False
                        w = ''
                if args.output_word_probs:
                    logger.info(
                        str(int(sample_id)) + " "
                        + ('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob)))

        wps_meter.update(sample['ntokens'])
        progress.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count / math.log(2)  # convert to base 2
    logger.info('Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(
        gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    logger.info('Loss (base 2): {:.4f}, Perplexity: {:.2f}'.format(
        avg_nll_loss, 2**avg_nll_loss))

    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            logger.info(ws)

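# Worked example (illustrative only, not part of the original script): the loop
# above accumulates token log-probabilities in nats, so dividing the average
# negative log-likelihood by math.log(2) converts it to bits, and perplexity is
# 2 ** avg_nll_loss. The totals below are made-up numbers.
import math

score_sum = -1000 * math.log(32)                 # hypothetical sum of log-probs (nats)
count = 1000                                     # hypothetical token count
avg_nll_loss = -score_sum / count / math.log(2)  # = 5.0 bits per token
perplexity = 2 ** avg_nll_loss                   # = 32.0
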
def validate_iw(args, trainer, task, epoch_itr, subsets, prune=-1, mode='iw'):
    """Evaluate the model on the validation set(s) and return the losses."""
    if mode == 'none' or mode == 'time' or args.criterion == 'lm_baseline':
        return [0]

    # top k instead of sampling to approximate sum of prototypes for evaluation
    for subset in subsets:
        task.dataset(subset).set_sampling(False)

    if args.fixed_validation_seed is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(args.fixed_validation_seed)

    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args.log_format,
            log_interval=args.log_interval,
            epoch=1,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(
                args.tensorboard_logdir if distributed_utils.is_master(args) else None
            ),
            default_log_format=('tqdm' if not args.no_progress_bar else 'simple'),
        )

        if prune > 0:
            index_map = trainer.get_model().set_prune_index(prune)
            task.set_index_map(index_map)

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for sample in progress:
                trainer.valid_iw_step(sample, mode=mode)

        # log validation stats
        stats = get_valid_stats(args, trainer, agg.get_smoothed_values())
        progress.print(stats, tag='valid_iw', step=trainer.get_num_updates())
        # valid_losses.append(stats[args.best_checkpoint_metric])

        if prune > 0:
            trainer.get_model().reset_prune_index()
            task.reset_index_map()

    return valid_losses

def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    # Fix seed for stochastic decoding
    if args.seed is not None and not args.no_seed_provided:
        np.random.seed(args.seed)
        utils.set_torch_seed(args.seed)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
    )

    # Optimize ensemble for generation
    for model in models:
        model.prepare_for_inference_(args)
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    scorer = scoring.build_scorer(args, tgt_dict)

    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample['target'][:, :args.prefix_size]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            if 'src_tokens' in sample['net_input']:
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            else:
                src_tokens = None

            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(
                    sample['target'][i, :], tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(
                        target_tokens,
                        args.remove_bpe,
                        escape_unk=True,
                        extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                    )

            src_str = decode_fn(src_str)
            if has_target:
                target_str = decode_fn(target_str)

            if not args.quiet:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str), file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                )
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    # original hypothesis (after tokenization and BPE)
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file)
                    # detokenized hypothesis
                    print('D-{}\t{}\t{}'.format(sample_id, score, detok_hypo_str), file=output_file)
                    print(
                        'P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                # convert from base e to base 2
                                hypo['positional_scores'].div_(math.log(2)).tolist(),
                            ))
                        ),
                        file=output_file,
                    )

                    if args.print_alignment:
                        print(
                            'A-{}\t{}'.format(
                                sample_id,
                                ' '.join([
                                    '{}-{}'.format(src_idx, tgt_idx)
                                    for src_idx, tgt_idx in alignment
                                ])
                            ),
                            file=output_file,
                        )

                    if args.print_step:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file)

                    if getattr(args, 'retain_iter_history', False):
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tgt_dict.encode_line(
                            target_str, add_if_not_exist=True)
                        hypo_tokens = tgt_dict.encode_line(
                            detok_hypo_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, detok_hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample["nsentences"] if "nsentences" in sample else sample['id'].numel()

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        if args.bpe and not args.sacrebleu:
            if args.remove_bpe:
                logger.warning(
                    "BLEU score is being computed by splitting detokenized string on spaces, "
                    "this is probably not what you want. "
                    "Use --sacrebleu for standard 13a BLEU tokenization"
                )
            else:
                logger.warning(
                    "If you are using BPE on the target side, the BLEU score is computed "
                    "on BPE tokens, not on proper words. "
                    "Use --sacrebleu for standard 13a BLEU tokenization"
                )
        # use print to be consistent with other main outputs: S-, H-, T-, D- and so on
        print('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()), file=output_file)

    return scorer

def validate(args, trainer, task, epoch_itr, subsets, generator=None):
    """Evaluate the model on the validation set(s) and return the losses."""
    valid_losses = []
    if args.eval_bleu:
        bleus = validate_translation(args, trainer, task, epoch_itr, generator)
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
            noskip=True,
        )[0].next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple'
        )

        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()
        extra_meters = collections.defaultdict(lambda: AverageMeter())
        if args.eval_bleu:
            for k, v in bleus.items():
                extra_meters[k + ":bleu"].update(v)

        for sample in progress:
            log_output = trainer.valid_step(sample)

            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                    continue
                extra_meters[k].update(v)

        # log validation stats
        stats = get_valid_stats(trainer, args, extra_meters)
        if epoch_itr.epoch > args.switch_obj_epoch:
            for k, v in extra_meters.items():
                # print(k, v.avg)
                if k.endswith(":loss"):
                    k = k.split(":")[0]
                    trainer.valid_losses[k] = v.avg
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(
            stats[args.best_checkpoint_metric].avg
            if args.best_checkpoint_metric == 'loss'
            else stats[args.best_checkpoint_metric]
        )
    if args.eval_bleu:
        return [sum(bleus.values())]
    else:
        return valid_losses

def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(
        args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(
        args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    if args.score_reference:
        translator = SequenceScorer(models, task.target_dictionary)
    else:
        translator = SequenceGenerator(
            models,
            task.target_dictionary,
            beam_size=args.beam,
            minlen=args.min_len,
            stop_early=(not args.no_early_stop),
            normalize_scores=(not args.unnormalized),
            len_penalty=args.lenpen,
            unk_penalty=args.unkpen,
            sampling=args.sampling,
            sampling_topk=args.sampling_topk,
            sampling_temperature=args.sampling_temperature,
            diverse_beam_groups=args.diverse_beam_groups,
            diverse_beam_strength=args.diverse_beam_strength,
        )

    if use_cuda:
        translator.cuda()

    # Generate and compute BLEU score
    scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        if args.score_reference:
            translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        else:
            translations = translator.generate_batched_itr(
                t,
                maxlen_a=args.max_len_a,
                maxlen_b=args.max_len_b,
                cuda=use_cuda,
                timer=gen_timer,
                prefix_size=args.prefix_size,
            )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            hypo['positional_scores'].tolist(),
                        ))
                    ))

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                        ))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

def build_translator(args):
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    # print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides=eval(args.model_overrides))

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models,
        tgt_dict,
        beam_size=args.beam,
        minlen=args.min_len,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups,
        diverse_beam_strength=args.diverse_beam_strength,
    )

    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str='O\t{}'.format(src_str),
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
            result.pos_scores.append('P\t{}'.format(' '.join(map(
                lambda x: '{:.4f}'.format(x),
                hypo['positional_scores'].tolist(),
            ))))
            result.alignments.append(
                'A\t{}'.format(' '.join(map(lambda x: str(utils.item(x)), alignment)))
                if args.print_alignment else None
            )

        return result

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        encoder_input = {'src_tokens': tokens, 'src_lengths': lengths}
        translations = translator.generate(
            encoder_input,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )

        return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    # if args.buffer_size > 1:
    #     print('| Sentence buffer size:', args.buffer_size)
    # print('| Type the input sentence and press return:')

    def translate(inputs):
        indices = []
        results = []
        ordered_results = []
        for batch, batch_indices in make_batches(inputs, args, task, max_positions):
            indices.extend(batch_indices)
            results += process_batch(batch)
        for i in np.argsort(indices):
            result = results[i]
            ordered_results.append(result)
            # print(result.src_str)
            # for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
            #     print(hypo)
            #     print(pos_scores)
            #     if align is not None:
            #         print(align)
        return ordered_results

    return translate

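# Hypothetical usage sketch for build_translator above (the example sentence and
# the CLI setup are assumptions; `options.get_generation_parser` is the standard
# fairseq helper of the same era as this snippet). build_translator returns a
# closure, so decoding a few raw sentences looks like this:
from fairseq import options

parser = options.get_generation_parser(interactive=True)
args = options.parse_args_and_arch(parser)
translate = build_translator(args)
for result in translate(['Hello world !']):
    print(result.src_str)
    for hypo, pos_scores in zip(result.hypos, result.pos_scores):
        print(hypo)
        print(pos_scores)
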
def validate(args, trainer, task, epoch_itr, subsets, force_refine_step=None):
    """Evaluate the model on the validation set(s) and return the losses."""
    valid_random = np.random.RandomState(3)
    valid_task_random = np.random.RandomState(3)
    if not hasattr(task, "random"):
        task.random = None
    task_random_bak = task.random
    task.random = valid_task_random
    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        dataset = task.dataset(subset)
        if hasattr(dataset, "random"):
            random_bak = dataset.random
        else:
            random_bak = None
        dataset.random = valid_random
        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple'
        )

        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()
        extra_meters = collections.defaultdict(lambda: AverageMeter())

        if hasattr(args, "progressive") and args.progressive:
            dataset.set_random_refine_step(args.refinetot, force_refine_step=force_refine_step)
        for sample in progress:
            if sample is None or len(sample) == 0:
                sys.stderr.write("empty valid sample detected\n")
                sys.stderr.flush()
                break
            log_output = trainer.valid_step(sample)

            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                    continue
                extra_meters[k].update(v)
            if hasattr(args, "progressive") and args.progressive:
                dataset.set_random_refine_step(
                    args.refinetot, force_refine_step=force_refine_step)

        # log validation stats
        stats = get_valid_stats(trainer)
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(
            stats[args.best_checkpoint_metric]
            if type(stats[args.best_checkpoint_metric]) == float
            else stats[args.best_checkpoint_metric].avg
        )
        dataset.random = random_bak
        if HAS_WANDB and distributed_utils.is_master(args):
            stat_dict = {}
            for k, v in stats.items():
                if isinstance(v, AverageMeter):
                    stat_dict[k] = v.val
                else:
                    stat_dict[k] = v
            wandb.log(stat_dict, step=trainer.get_num_updates())
    task.random = task_random_bak
    return valid_losses

def _main(args, output_file):
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger("espresso.dump_posteriors")

    print_options_meaning_changes(args, logger)

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    # Fix seed for stochastic decoding
    if args.seed is not None and not args.no_seed_provided:
        np.random.seed(args.seed)
        utils.set_torch_seed(args.seed)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load ensemble
    logger.info("loading model(s) from {}".format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
    )

    # Load state prior for cross-entropy trained systems decoding
    if args.state_prior_file is not None:
        prior = torch.from_numpy(kaldi_io.read_vec_flt(args.state_prior_file))
    else:
        prior = []

    # Optimize ensemble for generation
    for model in models:
        model.prepare_for_inference_(args)
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()
        if isinstance(prior, list) and getattr(model, "state_prior", None) is not None:
            prior.append(model.state_prior.unsqueeze(0))

    if isinstance(prior, list) and len(prior) > 0:
        prior = torch.cat(prior, 0).mean(0)  # average priors across models
        prior = prior / prior.sum()  # re-normalize
    elif isinstance(prior, list):
        prior = None

    if prior is not None:
        if args.fp16:
            prior = prior.half()
        if use_cuda:
            prior = prior.cuda()
        log_prior = prior.log()
    else:
        log_prior = None

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, "encoder")
                else (None, model.max_positions())
                for model in models
            ]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=("tqdm" if not args.no_progress_bar else "none"),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    # Generate and dump
    num_sentences = 0
    chunk_width = getattr(task, "chunk_width", None)
    lprobs_wspecifier = "ark:| copy-matrix ark:- ark:-"
    with kaldi_io.open_or_fd(lprobs_wspecifier, "wb") as f:
        if chunk_width is None:  # normal dumping (i.e., no chunking)
            for sample in progress:
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                if "net_input" not in sample:
                    continue

                gen_timer.start()
                lprobs, padding_mask = task.inference_step(generator, models, sample)
                if log_prior is not None:
                    assert lprobs.size(-1) == log_prior.size(0)
                    lprobs = lprobs - log_prior
                out_lengths = (~padding_mask).long().sum(dim=1).cpu() \
                    if padding_mask is not None else None
                num_processed_frames = sample["ntokens"]
                gen_timer.stop(num_processed_frames)
                num_sentences += sample["nsentences"] \
                    if "nsentences" in sample else sample["id"].numel()

                if out_lengths is not None:
                    for i in range(sample["nsentences"]):
                        length = out_lengths[i]
                        kaldi_io.write_mat(
                            f, lprobs[i, :length, :].cpu().numpy(), key=sample["utt_id"][i])
                else:
                    for i in range(sample["nsentences"]):
                        kaldi_io.write_mat(
                            f, lprobs[i, :, :].cpu().numpy(), key=sample["utt_id"][i])
        else:  # dumping chunks within the same utterance from left to right
            for sample in progress:  # sample is actually a list of batches
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                utt_id = sample[0]["utt_id"]
                id = sample[0]["id"]
                whole_lprobs = None
                for i, chunk_sample in enumerate(sample):
                    if "net_input" not in chunk_sample:
                        continue

                    assert chunk_sample["utt_id"] == utt_id and (chunk_sample["id"] == id).all()
                    gen_timer.start()
                    lprobs, _ = task.inference_step(generator, models, chunk_sample)
                    if log_prior is not None:
                        assert lprobs.size(-1) == log_prior.size(0)
                        lprobs = lprobs - log_prior
                    if whole_lprobs is None:
                        whole_lprobs = lprobs.cpu()
                    else:
                        whole_lprobs = torch.cat((whole_lprobs, lprobs.cpu()), 1)
                    num_processed_frames = chunk_sample["ntokens"]
                    gen_timer.stop(num_processed_frames)

                    if i == len(sample) - 1:
                        num_sentences += len(utt_id)
                        for j in range(len(utt_id)):
                            truncated_length = models[0].output_lengths(
                                task.dataset(args.gen_subset).src_sizes[id[j]]
                            )  # length is after possible subsampling by the model
                            mat = whole_lprobs[j, :truncated_length, :]
                            kaldi_io.write_mat(f, mat.numpy(), key=utt_id[j])

    logger.info(
        "Dumped {} utterances ({} frames) in {:.1f}s ({:.2f} sentences/s, {:.2f} frames/s)"
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))

    return

def main(args):
    ckpt = torch.load(args.checkpoint)
    ckpt['args'].path = args.checkpoint
    task = tasks.setup_task(ckpt['args'])
    models, _model_args = checkpoint_utils.load_model_ensemble(
        ckpt['args'].path.split(os.pathsep),
        arg_overrides=eval("{}"),
        task=task,
    )
    model = models[0]
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    tokenizer = encoders.build_tokenizer(ckpt['args'])
    bpe = encoders.build_bpe(ckpt['args'])

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    start_id = 0
    outputs = {
        "id": [],
        "enc_ctrl_feat": [],
        "dec_ctrl_feat": [],
    }
    ctrl_enc = getattr(model.encoder, 'module_ctrl', None)
    ctrl_dec = getattr(model.decoder, 'module_ctrl', None)
    if ctrl_enc is not None:
        outputs['enc_ctrl_pred'] = []
    if ctrl_dec is not None:
        outputs['dec_ctrl_pred'] = []

    for inputs in buffered_read(args.input_file, 100):
        results = []
        with torch.no_grad():
            for batch in make_batches(inputs, ckpt['args'], task, max_positions, encode_fn):
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                # if use_cuda:
                #     src_tokens = src_tokens.cuda()
                #     src_lengths = src_lengths.cuda()
                bsz = src_tokens.size(0)
                mask = src_tokens.eq(model.encoder.padding_idx).view(bsz, -1, 1)

                emb_out, _ = model.encoder.forward_embedding(src_tokens)
                if ctrl_enc is not None:
                    enc_ctrl_feat = ctrl_enc.extract_features(
                        emb_out.clone(), padding_mask=mask)
                    enc_ctrl_out = ctrl_enc(
                        emb_out.clone(), 'validation', padding_mask=mask)

                enc_out = model.encoder.forward(src_tokens, src_lengths)
                enc_out = enc_out.encoder_out.transpose(0, 1)
                if ctrl_dec is not None:
                    dec_ctrl_feat = ctrl_dec.extract_features(
                        enc_out.clone(), padding_mask=mask)
                    dec_ctrl_out = ctrl_dec(
                        enc_out.clone(), 'validation', padding_mask=mask)

                for i, idx in enumerate(batch.ids.tolist()):
                    outputs['id'].append(idx)
                    outputs['enc_ctrl_feat'].append(enc_ctrl_feat[i].numpy())
                    outputs['dec_ctrl_feat'].append(dec_ctrl_feat[i].numpy())
                    if ctrl_enc is not None:
                        outputs['enc_ctrl_pred'].append(
                            enc_ctrl_out.ctrl_prediction[i].numpy())
                    if ctrl_dec is not None:
                        outputs['dec_ctrl_pred'].append(
                            dec_ctrl_out.ctrl_prediction[i].numpy())

    for k in outputs:
        outputs[k] = np.stack(outputs[k], 0)
    if ctrl_enc is not None:
        outputs['enc_ctrl_proj'] = ctrl_enc.out_proj.weight.detach().numpy()
    if ctrl_dec is not None:
        outputs['dec_ctrl_proj'] = ctrl_dec.out_proj.weight.detach().numpy()
    np.savez(args.output_file, **outputs)

def main(args):
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Make a dummy batch to (i) warm the caching allocator and (ii) as a
    # placeholder DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))

def eval_dataset(task, model, dataset, out_file, thresholds, compute_metrics, use_cuda=True):
    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 4096,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(model.max_positions()),
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        ignore_invalid_inputs=False,
    ).next_epoch_itr(shuffle=False)

    model.eval()

    if thresholds is None or len(thresholds) == 0:
        thresholds = [0.]

    predictions = {}
    meters = {t: ClassificationMeter() for t in thresholds}

    with torch.no_grad(), progress_bar.build_progress_bar(args, itr) as t:
        for batch in t:
            if use_cuda:
                batch = utils.move_to_cuda(batch)
            logits = model(**batch['net_input'])
            assert logits.shape[-1] == 2
            normed = logits.softmax(dim=-1)
            diff = normed[:, :, 0] - normed[:, :, 1]
            pred_class = {t: (diff < t) for t in thresholds}
            for i in range(len(batch['id'])):
                pred = pred_class[thresholds[0]][i].item()
                predictions[batch['id'][i].item()] = pred
                if compute_metrics:
                    preds = {
                        t: pred_class[t][i].item() for t in thresholds
                    } if len(thresholds) > 0 else {
                        0.: pred_class[i].item()
                    }
                    actual = batch['target'][i].item()
                    assert actual in {0, 1}, 'bad actual! ' + str(actual)
                    for t, p in preds.items():
                        assert p in {0, 1}, 'bad p! ' + str(p)
                        tp = int(actual == 1 and p == 1)
                        fp = int(actual == 0 and p == 1)
                        tn = int(actual == 0 and p == 0)
                        fn = int(actual == 1 and p == 0)
                        assert tp + fp + tn + fn == 1
                        meters[t].update(tp, tn, fp, fn)

    if compute_metrics:
        for t, m in meters.items():
            print('{}: {}'.format(t, m.vals()))

    if out_file is not None:
        with open(out_file, 'w') as out_f:
            print('index\tprediction', file=out_f)
            for i in range(len(predictions)):
                print(f'{i}\t{predictions[i]}', file=out_f)

def _main(args, output_file):
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=os.environ.get("LOGLEVEL", "INFO").upper(),
        stream=output_file,
    )
    logger = logging.getLogger("espresso.speech_recognize")
    if output_file is not sys.stdout:  # also print to stdout
        logger.addHandler(logging.StreamHandler(sys.stdout))

    print_options_meaning_changes(args, logger)

    utils.import_user_module(args)

    if args.max_tokens is None and args.batch_size is None:
        args.max_tokens = 12000
    logger.info(args)

    # Fix seed for stochastic decoding
    if args.seed is not None and not args.no_seed_provided:
        np.random.seed(args.seed)
        utils.set_torch_seed(args.seed)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionary
    dictionary = task.target_dictionary

    overrides = ast.literal_eval(args.model_overrides)

    # Load ensemble
    logger.info("loading model(s) from {}".format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=overrides,
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
        strict=(args.checkpoint_shard_count == 1),
        num_shards=args.checkpoint_shard_count,
    )

    if args.lm_path is not None:
        overrides["data"] = args.data

        try:
            lms, _ = checkpoint_utils.load_model_ensemble(
                utils.split_paths(args.lm_path),
                arg_overrides=overrides,
                task=None,
            )
        except:
            logger.warning(
                f"Failed to load language model! Please make sure that the language model dict is the same "
                f"as target dict and is located in the data dir ({args.data})")
            raise

        assert len(lms) == 1 or len(lms) == 2  # Multi-level LM expects two LMs
    else:
        lms = [None]

    for i, m in enumerate(lms):
        if m is None:
            continue
        if hasattr(m, "is_wordlm") and m.is_wordlm:
            # assume subword LM comes before word LM
            if i > 0 and isinstance(lms[i - 1], FairseqLanguageModel):
                lms[i - 1] = MultiLevelLanguageModel(
                    m,
                    lms[i - 1],
                    subwordlm_weight=args.subwordlm_weight,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                del lms[i]
                logger.info("LM fusion with Multi-level LM")
            else:
                lms[i] = TensorizedLookaheadLanguageModel(
                    m,
                    dictionary,
                    oov_penalty=args.oov_penalty,
                    open_vocab=not args.disable_open_vocab,
                )
                logger.info("LM fusion with Look-ahead Word LM")
        else:
            assert isinstance(m, FairseqLanguageModel)
            logger.info("LM fusion with Subword LM")
    if args.lm_weight != 0.0:
        logger.info("using LM fusion with lm-weight={:.2f}".format(args.lm_weight))

    # Optimize ensemble for generation
    for model in chain(models, lms):
        if model is None:
            continue
        if args.fp16:
            model.half()
        if use_cuda and not args.pipeline_model_parallel:
            model.cuda()
        model.prepare_for_inference_(args)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, "encoder")
                else (None, model.max_positions())
                for model in models
            ]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=("tqdm" if not args.no_progress_bar else "none"),
    )

    # Initialize generator
    if args.match_source_len:
        logger.warning(
            "The option match_source_len is not applicable to speech recognition. Ignoring it."
        )
    gen_timer = StopwatchMeter()

    extra_gen_cls_kwargs = {
        "lm_model": lms[0],
        "lm_weight": args.lm_weight,
        "eos_factor": args.eos_factor,
    }
    args.score_reference = False  # not applicable for ASR
    temp_val = args.print_alignment
    args.print_alignment = False  # not applicable for ASR
    generator = task.build_generator(
        models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs)
    args.print_alignment = temp_val

    # Handle tokenization and BPE
    tokenizer = task.build_tokenizer(args)
    bpe = task.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute WER
    scorer = wer.Scorer(dictionary, wer_output_filter=args.wer_output_filter)
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if "net_input" not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample["target"][:, :args.prefix_size]

        constraints = None
        if "constraints" in sample:
            constraints = sample["constraints"]

        gen_timer.start()
        hypos = task.inference_step(
            generator, models, sample,
            prefix_tokens=prefix_tokens, constraints=constraints)
        num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        # obtain nonpad mask of encoder output to plot attentions
        if args.print_alignment:
            net_input = sample["net_input"]
            src_tokens = net_input["src_tokens"]
            output_lengths = models[0].encoder.output_lengths(net_input["src_lengths"])
            nonpad_idxs = sequence_mask(
                output_lengths, models[0].encoder.output_lengths(src_tokens.size(1)))

        for i in range(len(sample["id"])):
            has_target = sample["target"] is not None
            utt_id = sample["utt_id"][i]

            # Retrieve the original sentences
            if has_target:
                target_str = sample["target_raw_text"][i]
                if not args.quiet:
                    detok_target_str = decode_fn(target_str)
                    print("T-{}\t{}".format(utt_id, detok_target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_str = dictionary.string(
                    hypo["tokens"].int().cpu(),
                    bpe_symbol=None,
                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                )  # not removing bpe at this point
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo["score"] / math.log(2)  # convert to base 2
                    print("H-{}\t{}\t{}".format(utt_id, detok_hypo_str, score), file=output_file)

                # Score and obtain attention only the top hypothesis
                if j == 0:
                    # src_len x tgt_len
                    attention = hypo["attention"][nonpad_idxs[i]].float().cpu() \
                        if args.print_alignment and hypo["attention"] is not None else None
                    if args.print_alignment and attention is not None:
                        save_dir = os.path.join(args.results_path, "attn_plots")
                        os.makedirs(save_dir, exist_ok=True)
                        plot_attention(attention, detok_hypo_str, utt_id, save_dir)
                    scorer.add_prediction(utt_id, hypo_str)
                    if has_target:
                        scorer.add_evaluation(utt_id, target_str, hypo_str)

        wps_meter.update(num_generated_tokens)
        progress.log({"wps": round(wps_meter.avg)})
        num_sentences += sample["nsentences"] if "nsentences" in sample else sample["id"].numel()

    logger.info("NOTE: hypothesis and token scores are output in base 2")
    logger.info(
        "Recognized {} utterances ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)"
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.print_alignment:
        logger.info("Saved attention plots in " + save_dir)

    if has_target:
        scorer.add_ordered_utt_list(task.datasets[args.gen_subset].tgt.utt_ids)

    fn = "decoded_char_results.txt"
    with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
        f.write(scorer.print_char_results())
        logger.info("Decoded char results saved as " + f.name)
    fn = "decoded_results.txt"
    with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
        f.write(scorer.print_results())
        logger.info("Decoded results saved as " + f.name)

    if has_target:
        header = "Recognize {} with beam={}: ".format(args.gen_subset, args.beam)
        fn = "wer"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            res = "WER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format(*(scorer.wer()))
            logger.info(header + res)
            f.write(res + "\n")
            logger.info("WER saved in " + f.name)
        fn = "cer"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            res = "CER={:.2f}%, Sub={:.2f}%, Ins={:.2f}%, Del={:.2f}%".format(*(scorer.cer()))
            logger.info(" " * len(header) + res)
            f.write(res + "\n")
            logger.info("CER saved in " + f.name)
        fn = "aligned_results.txt"
        with open(os.path.join(args.results_path, fn), "w", encoding="utf-8") as f:
            f.write(scorer.print_aligned_results())
            logger.info("Aligned results saved as " + f.name)
    return scorer

def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    args.beam = 1
    utils.import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    num_sentences = 0
    source_sentences = []
    shard_id = 0
    all_avg_pool = None
    encoder_has_langtok = (
        hasattr(task.args, 'encoder_langtok')
        and task.args.encoder_langtok is not None
        and hasattr(task.args, 'lang_tok_replacing_bos_eos')
        and not task.args.lang_tok_replacing_bos_eos
    )
    with progress_bar.build_progress_bar(args, itr) as t:
        for sample in t:
            if sample is None:
                print("Skipping None")
                continue
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            with torch.no_grad():
                avg_pool = get_avg_pool(
                    models, sample, prefix_tokens, src_dict,
                    args.remove_bpe, has_langtok=encoder_has_langtok)
                if all_avg_pool is not None:
                    all_avg_pool = np.concatenate((all_avg_pool, avg_pool))
                else:
                    all_avg_pool = avg_pool

            if not isinstance(sample['id'], list):
                sample_ids = sample['id'].tolist()
            else:
                sample_ids = sample['id']
            for i, sample_id in enumerate(sample_ids):
                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                elif src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                else:
                    src_str = ""
                if not args.quiet and src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str))
                source_sentences.append(f"{sample_id}\t{src_str}")

            num_sentences += sample['nsentences']
            if all_avg_pool.shape[0] >= 1000000:
                # ndarray.tofile writes raw bytes, so the dump file must be
                # opened in binary mode; the sentence file stays text-mode.
                with open(f'{args.encoder_save_dir}/all_avg_pool.{args.source_lang}.{shard_id}', 'wb') as avg_pool_file:
                    all_avg_pool.tofile(avg_pool_file)
                with open(f'{args.encoder_save_dir}/sentences.{args.source_lang}.{shard_id}', 'w') as sentence_file:
                    sentence_file.writelines(f'{line}\n' for line in source_sentences)
                all_avg_pool = None
                source_sentences = []
                shard_id += 1

    if all_avg_pool is not None:
        with open(f'{args.encoder_save_dir}/all_avg_pool.{args.source_lang}.{shard_id}', 'wb') as avg_pool_file:
            all_avg_pool.tofile(avg_pool_file)
        with open(f'{args.encoder_save_dir}/sentences.{args.source_lang}.{shard_id}', 'w') as sentence_file:
            sentence_file.writelines(f'{line}\n' for line in source_sentences)
    return None
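# --- Hedged sketch of the get_avg_pool helper the loop above calls. Assumes a
# --- fairseq-style encoder whose output dict has 'encoder_out' (T x B x C) and
# --- 'encoder_padding_mask' (B x T); the fork's real helper may differ.
import torch

def get_avg_pool(models, sample, prefix_tokens, src_dict, remove_bpe, has_langtok=False):
    model = models[0]
    with torch.no_grad():
        enc = model.encoder(sample['net_input']['src_tokens'],
                            src_lengths=sample['net_input']['src_lengths'])
    states = enc['encoder_out']             # T x B x C (assumed layout)
    pad_mask = enc['encoder_padding_mask']  # B x T or None
    if has_langtok:
        states = states[1:]                 # drop the prepended language token
        pad_mask = pad_mask[:, 1:] if pad_mask is not None else None
    if pad_mask is not None:
        keep = (~pad_mask).t().unsqueeze(-1).float()  # T x B x 1
        pooled = (states * keep).sum(0) / keep.sum(0).clamp(min=1.0)
    else:
        pooled = states.mean(0)
    return pooled.float().cpu().numpy()     # B x C, ready for np.concatenate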
def validate(args, trainer, task, epoch_itr, subsets): """Evaluate the model on the validation set(s) and return the losses.""" if args.fixed_validation_seed is not None: # set fixed seed for every validation utils.set_torch_seed(args.fixed_validation_seed) valid_losses = [] for subset in subsets: # Initialize data iterator itr = task.get_batch_iterator( dataset=task.dataset(subset), max_tokens=args.max_tokens_valid, max_sentences=args.max_sentences_valid, max_positions=utils.resolve_max_positions( task.max_positions(), trainer.get_model().max_positions(), ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.build_progress_bar( args, itr, epoch_itr.epoch, prefix='valid on \'{}\' subset'.format(subset), no_progress_bar='simple') # reset validation loss meters for k in ['valid_loss', 'valid_nll_loss']: meter = trainer.get_meter(k) if meter is not None: meter.reset() extra_meters = collections.defaultdict(lambda: AverageMeter()) if callable(getattr(trainer.criterion, 'set_valid_tgt_dataset', None)): trainer.criterion.set_valid_tgt_dataset(task.dataset(subset).tgt) for sample in progress: log_output = trainer.valid_step(sample) for k, v in log_output.items(): if k in [ 'loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size', 'word_count', 'char_count' ]: continue if k == 'word_error': extra_meters['wer'].update( float(v) / log_output['word_count'] * 100, log_output['word_count']) elif k == 'char_error': extra_meters['cer'].update( float(v) / log_output['char_count'] * 100, log_output['char_count']) else: extra_meters[k].update(v) # log validation stats stats = get_valid_stats(trainer, args, extra_meters) for k, meter in extra_meters.items(): stats[k] = meter.avg progress.print(stats, tag=subset, step=trainer.get_num_updates()) valid_losses.append(stats[args.best_checkpoint_metric].avg if args. best_checkpoint_metric == 'loss' else stats[args.best_checkpoint_metric]) return valid_losses
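# --- Hedged sketch of the get_valid_stats helper this validate() calls,
# --- following the old fairseq train.py convention: 'loss'/'nll_loss' are
# --- meters (callers read .avg), plus base-2 perplexity and the update count.
import collections

def get_valid_stats(trainer, args, extra_meters=None):
    stats = collections.OrderedDict()
    stats['loss'] = trainer.get_meter('valid_loss')
    if trainer.get_meter('valid_nll_loss').count > 0:
        stats['nll_loss'] = trainer.get_meter('valid_nll_loss')
        nll = stats['nll_loss'].avg
    else:
        nll = stats['loss'].avg
    stats['ppl'] = 2 ** nll if nll < 50 else float('inf')  # overflow guard
    stats['num_updates'] = trainer.get_num_updates()
    return stats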
def main(args, task=None, model_state=None): check_args(args) if args.max_tokens is None and args.batch_size is None: args.max_tokens = 4000000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu logger.info("| decoding with criterion {}".format(args.criterion)) #import cloudpickle as pickle #pickle.dump(args, open('ARGS.bin', 'wb')) #args = pickle.load(open('ARGS.bin', 'rb')) #model_state = None task = tasks.setup_task(args) # Load ensemble if args.load_emissions: models, criterions = [], [] task.load_dataset(args.gen_subset) else: logger.info("| loading model(s) from {}".format(args.path)) #models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(args.path), arg_overrides=ast.literal_eval(args.model_overrides), task=task, suffix=args.checkpoint_suffix, strict=(args.checkpoint_shard_count == 1), num_shards=args.checkpoint_shard_count, state=model_state, ) optimize_models(args, use_cuda, models) task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task) # Set dictionary tgt_dict = task.target_dictionary logger.info("| {} {} {} examples".format( args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) # hack to pass transitions to W2lDecoder if args.criterion == "asg_loss": raise NotImplementedError("asg_loss is currently not supported") # trans = criterions[0].asg.trans.data # args.asg_transitions = torch.flatten(trans).tolist() # Load dataset (possibly sharded) itr = get_dataset_itr(args, task, models) # Initialize generator gen_timer = StopwatchMeter() def build_generator(args): w2l_decoder = getattr(args, "w2l_decoder", None) if w2l_decoder == "viterbi": from w2l_decoder import W2lViterbiDecoder return W2lViterbiDecoder(args, task.target_dictionary) elif w2l_decoder == "kenlm": from w2l_decoder import W2lKenLMDecoder return W2lKenLMDecoder(args, task.target_dictionary) elif w2l_decoder == "fairseqlm": from w2l_decoder import W2lFairseqLMDecoder return W2lFairseqLMDecoder(args, task.target_dictionary) else: print( "only flashlight decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment" ) # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task generator = build_generator(args) if args.load_emissions: generator = ExistingEmissionsDecoder( generator, np.load(args.load_emissions, allow_pickle=True)) logger.info("loaded emissions from " + args.load_emissions) num_sentences = 0 if args.results_path is not None and not os.path.exists(args.results_path): os.makedirs(args.results_path) max_source_pos = (utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ) if max_source_pos is not None: max_source_pos = max_source_pos[0] if max_source_pos is not None: max_source_pos = max_source_pos[0] - 1 if args.dump_emissions: emissions = {} if args.dump_features: features = {} models[0].bert.proj = None else: res_files = prepare_result_files(args) errs_t = 0 lengths_t = 0 with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample["target"][:, :args.prefix_size] gen_timer.start() if args.dump_emissions: with torch.no_grad(): encoder_out = models[0](**sample["net_input"]) emm = models[0].get_normalized_probs(encoder_out, log_probs=True) emm = emm.transpose(0, 1).cpu().numpy() for 
i, id in enumerate(sample["id"]):
                        emissions[id.item()] = emm[i]
                continue
            elif args.dump_features:
                with torch.no_grad():
                    encoder_out = models[0](**sample["net_input"])
                    feat = encoder_out["encoder_out"].transpose(0, 1).cpu().numpy()
                    for i, id in enumerate(sample["id"]):
                        padding = (
                            encoder_out["encoder_padding_mask"][i].cpu().numpy()
                            if encoder_out["encoder_padding_mask"] is not None
                            else None
                        )
                        features[id.item()] = (feat[i], padding)
                continue
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample["id"].tolist()):
                speaker = None
                # id = task.dataset(args.gen_subset).ids[int(sample_id)]
                id = sample_id
                toks = (
                    sample["target"][i, :]
                    if "target_label" not in sample
                    else sample["target_label"][i, :]
                )
                target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu()
                # Process top predictions
                errs, length = process_predictions(
                    args, hypos[i], None, tgt_dict, target_tokens, res_files,
                    speaker, id,
                )
                errs_t += errs
                lengths_t += length

            wps_meter.update(num_generated_tokens)
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += (
                sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
            )

    wer = None
    if args.dump_emissions:
        emm_arr = []
        for i in range(len(emissions)):
            emm_arr.append(emissions[i])
        np.save(args.dump_emissions, emm_arr)
        logger.info(f"saved {len(emissions)} emissions to {args.dump_emissions}")
    elif args.dump_features:
        feat_arr = []
        for i in range(len(features)):
            feat_arr.append(features[i])
        np.save(args.dump_features, feat_arr)
        logger.info(f"saved {len(features)} features to {args.dump_features}")
    else:
        if lengths_t > 0:
            wer = errs_t * 100.0 / lengths_t
            logger.info(f"WER: {wer}")
        logger.info(
            "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f} "
            "sentences/s, {:.2f} tokens/s)".format(
                num_sentences,
                gen_timer.n,
                gen_timer.sum,
                num_sentences / gen_timer.sum,
                1.0 / gen_timer.avg,
            ))
        logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
    return task, wer
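# --- Hedged sketch of the process_predictions helper used above (wav2vec
# --- infer.py style). Assumes hypos is a list of dicts with 'tokens' and that
# --- res_files maps names like 'hypo.words'/'ref.words' to open file handles;
# --- uses a plain Levenshtein distance where the original likely uses the
# --- editdistance package. Returns (word_errors, target_word_count).
def levenshtein(a, b):
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]

def process_predictions(args, hypos, generator, tgt_dict, target_tokens,
                        res_files, speaker, sample_id):
    best = hypos[0]
    hyp_words = tgt_dict.string(best['tokens'].int().cpu()).split()
    tgt_words = tgt_dict.string(target_tokens).split()
    if res_files is not None:
        print('{} ({})'.format(' '.join(hyp_words), sample_id),
              file=res_files['hypo.words'])
        print('{} ({})'.format(' '.join(tgt_words), sample_id),
              file=res_files['ref.words'])
    return levenshtein(hyp_words, tgt_words), len(tgt_words)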
def validate_translation(args, trainer, task, epoch_itr, generator): src_dict = task.source_dictionary tgt_dict = task.target_dictionary models = [trainer.get_model()] bleu_dict = {key: None for key in task.eval_lang_pairs} # Generate and compute BLEU score if args.sacrebleu: scorer_dict = { key: bleu.SacrebleuScorer() for key in task.eval_lang_pairs } else: scorer_dict = { key: bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) for key in task.eval_lang_pairs } itr = task.get_batch_iterator( dataset=task.dataset('valid'), max_tokens=args.max_tokens_valid, max_sentences=args.max_sentences_valid, max_positions=utils.resolve_max_positions( task.max_positions(), trainer.get_model().max_positions(), ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, noskip=True, ).next_epoch_itr(shuffle=False) progress = progress_bar.build_progress_bar(args, itr, epoch_itr.epoch, prefix='translate subset', no_progress_bar='simple') num_sentences = 0 has_target = True #with progress_bar.build_progress_bar(args, itr) as t: for samples in progress: if torch.cuda.is_available() and not args.cpu: samples = utils.move_to_cuda(samples) #if 'net_input' not in samples: # continue prefix_tokens = None for key, sample in samples.items(): hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None target_tokens = None if has_target: target_tokens = utils.strip_pad( sample['target'][i, :], tgt_dict.pad()).int().cpu() # Remove padding if args.sde: src_tokens = target_tokens else: src_tokens = utils.strip_pad( sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) # Either retrieve the original sentences or regenerate them from tokens. 
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str="",
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=None,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk
                            # replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer_dict[key], 'add_string'):
                            scorer_dict[key].add_string(target_str, hypo_str)
                        else:
                            scorer_dict[key].add(target_tokens, hypo_tokens)

            num_sentences += sample['nsentences']

    for key, scorer in scorer_dict.items():
        bleu_dict[key] = scorer.score()
    return bleu_dict
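# --- Usage sketch (hypothetical helper): collapsing the per-language-pair BLEU
# --- dict returned by validate_translation() into one scalar, e.g. for
# --- checkpoint selection.
def average_valid_bleu(bleu_dict):
    scores = [s for s in bleu_dict.values() if s is not None]
    return sum(scores) / max(len(scores), 1)

# bleu_dict = validate_translation(args, trainer, task, epoch_itr, generator)
# mean_bleu = average_valid_bleu(bleu_dict)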
def main(args): assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 print(args) use_cuda = torch.cuda.is_available() and not args.cpu # # Load dataset splits # task = tasks.setup_task(args) # task.load_dataset(args.gen_subset) # # # Set dictionaries # try: # src_dict = getattr(task, 'source_dictionary', None) # except NotImplementedError: # src_dict = None # tgt_dict = task.target_dictionary task = None # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _model_args, task = checkpoint_utils.load_model_ensemble( args.path.split(':'), arg_overrides=eval(args.model_overrides), task=task, ) print(_model_args) task.load_dataset(args.gen_subset) # Set dictionaries try: src_dict = getattr(task, 'source_dictionary', None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models] ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, num_shards=args.num_shards, shard_id=args.shard_id, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) # Generate and compute BLEU score if args.sacrebleu: scorer = bleu.SacrebleuScorer() else: scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for sample in t: sample = prepare_sample(_model_args, task, sample, use_cuda) if 'net_input' not in sample: continue prefix_tokens = None if args.prefix_size > 0: prefix_tokens = sample['target'][:, :args.prefix_size] gen_timer.start() hypos = task.inference_step(generator, models, sample, prefix_tokens) num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample['id'].tolist()): has_target = sample['target'] is not None # Remove padding src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad()) target_tokens = None if has_target: target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu() # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id) else: if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) else: src_str = "" if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: if src_dict is not None: print('S-{}\t{}'.format(sample_id, src_str)) if has_target: print('T-{}\t{}'.format(sample_id, target_str)) # Process top predictions for j, hypo in enumerate(hypos[i][:args.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str)) print('P-{}\t{}'.format( sample_id, ' '.join(map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), )) )) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join(['{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment]) )) if args.print_step: print('I-{}\t{}'.format(sample_id, hypo['steps'])) # Score only the top hypothesis if has_target and j == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True) if hasattr(scorer, 'add_string'): scorer.add_string(target_str, hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) t.log({'wps': round(wps_meter.avg)}) num_sentences += sample['nsentences'] print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string())) return scorer
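# --- Hedged sketch of the prepare_sample helper called in the loop above; the
# --- fork's real version may do more, but the call site only needs device
# --- placement and optional fp16 casting. Assumes fairseq's utils module
# --- (move_to_cuda, apply_to_sample).
import torch

def prepare_sample(model_args, task, sample, use_cuda):
    if sample is None:
        return {}
    if use_cuda:
        sample = utils.move_to_cuda(sample)
    if getattr(model_args, 'fp16', False):
        sample = utils.apply_to_sample(
            lambda t: t.half() if t.dtype is torch.float32 else t, sample)
    return sample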
def validate(args, trainer, task, epoch_itr, subsets, prune=-1): """Evaluate the model on the validation set(s) and return the losses.""" if args.fixed_validation_seed is not None: # set fixed seed for every validation utils.set_torch_seed(args.fixed_validation_seed) valid_losses = [] for subset in subsets: # Initialize data iterator itr = task.get_batch_iterator( dataset=task.dataset(subset), max_tokens=args.max_tokens_valid, max_sentences=args.max_sentences_valid, max_positions=utils.resolve_max_positions( task.max_positions(), trainer.get_model().max_positions(), ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, epoch=epoch_itr.epoch, prefix=f"valid on '{subset}' subset", tensorboard_logdir=(args.tensorboard_logdir if distributed_utils.is_master(args) else None), default_log_format=('tqdm' if not args.no_progress_bar else 'simple'), ) # added by Junxian if prune > 0: index_map = trainer.get_model().set_prune_index(prune) task.set_index_map(index_map) # not write templates for time profiling write_template_flag = False if args.eval_mode == 'time' else True # only one worker deals with the template file in DDP if args.distributed_rank == 0 and write_template_flag: print('write template files') if args.eval_mode == 'none': fout = open( os.path.join( args.save_dir, 'templates_{}_{}.txt'.format( epoch_itr.epoch, trainer.get_num_updates())), 'w') else: fout = open( os.path.join(args.save_dir, 'templates_eval_{}.txt'.format(subset)), 'w') if prune <= 0: task.write_lambda(fout, trainer.get_model()) else: fout = None # create a new root metrics aggregator so validation metrics # don't pollute other aggregators (e.g., train meters) with metrics.aggregate(new_root=True) as agg: for sample in progress: trainer.valid_step(sample, split=subset) # added by Junxian if args.distributed_rank == 0: task.write_template(sample, trainer.get_model(), fout) if fout is not None: fout.close() # log validation stats stats = get_valid_stats(args, trainer, agg.get_smoothed_values()) progress.print(stats, tag=subset, step=trainer.get_num_updates()) valid_losses.append(stats[args.best_checkpoint_metric]) return valid_losses
def main(args): assert args.path is not None, '--path required for generation!' assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert args.replace_unk is None or args.raw_text, \ '--replace-unk requires a raw text dataset (--raw-text)' if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset)))) # Set dictionaries src_dict = task.source_dictionary tgt_dict = task.target_dictionary # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task, model_arg_overrides=eval( args.model_overrides)) # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if isinstance(task, MultilingualTranslationTask): for i in range(len(models)): models[i] = models[i].models[task.lang_pair] # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) # Load dataset (possibly sharded) models_max_positions = [model.max_positions() for model in models] if isinstance(models_max_positions[0], dict): new_max_positions = [] for max_p in models_max_positions: for _, val in max_p.items(): new_max_positions += [val] models_max_positions = new_max_positions itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( max(task.max_positions(), *models_max_positions)), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=8, num_shards=args.num_shards, shard_id=args.shard_id, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() if args.score_reference: translator = SequenceScorer(models, task.target_dictionary) else: translator = SequenceGenerator( models, task.target_dictionary, beam_size=args.beam, minlen=args.min_len, stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized), len_penalty=args.lenpen, unk_penalty=args.unkpen, sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature, diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength, ) if use_cuda: translator.cuda() # Generate and compute BLEU score scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) num_sentences = 0 has_target = True with progress_bar.build_progress_bar(args, itr) as t: if args.score_reference: translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer) else: translations = translator.generate_batched_itr( t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, cuda=use_cuda, timer=gen_timer, prefix_size=args.prefix_size, ) wps_meter = TimeMeter() for sample_id, src_tokens, target_tokens, hypos in translations: # Process input and ground truth has_target = target_tokens is not None target_tokens = target_tokens.int().cpu() if has_target else None # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset( args.gen_subset).src.get_original_text(sample_id) target_str = task.dataset( args.gen_subset).tgt.get_original_text(sample_id) else: src_str = src_dict.string(src_tokens, args.remove_bpe) if has_target: target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) if not args.quiet: print('S-{}\t{}'.format(sample_id, src_str)) if has_target: if args.char_tokenized: print('T-{}\t{}'.format( sample_id, target_str.replace(" ", "").replace("|", " "))) else: print('T-{}\t{}'.format(sample_id, target_str)) # Process top predictions for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) if not args.quiet: if args.char_tokenized: print('H-{}\t{}\t{}'.format( sample_id, hypo['score'], hypo_str.replace(" ", "").replace("|", " "))) else: print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str)) # print("HT-{}\t{}\t{}".format(sample_id, hypo['score'], hypo['tokens'].int().cpu())) # print('P-{}\t{}'.format( # sample_id, # ' '.join(map( # lambda x: '{:.4f}'.format(x), # hypo['positional_scores'].tolist(), # )) # )) if args.print_alignment: print('A-{}\t{}'.format( sample_id, ' '.join( map(lambda x: str(utils.item(x)), alignment)))) # Score only the top hypothesis if has_target and i == 0: if align_dict is not None or args.remove_bpe is not None: # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tokenizer.Tokenizer.tokenize( target_str, tgt_dict, add_if_not_exist=True) if args.char_tokenized: # for character tokenization tgt_tok_dic = data.Dictionary.load( f"{args.word_level_dict_path}/dict.{args.target_lang}.txt" ) target_tokens = tokenizer.Tokenizer.tokenize( target_str.replace(" ", "").replace("|", " "), tgt_tok_dic, add_if_not_exist=True) hypo_tokens = tokenizer.Tokenizer.tokenize( hypo_str.replace(" ", "").replace("|", " "), tgt_tok_dic, add_if_not_exist=True) scorer.add(target_tokens, hypo_tokens) wps_meter.update(src_tokens.size(0)) t.log({'wps': round(wps_meter.avg)}) num_sentences += 1 print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg)) if has_target: print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
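# --- Illustration of the character-level detokenization used above: char
# --- hypotheses separate characters with spaces and mark word boundaries with
# --- '|', so stripping spaces and mapping '|' back to spaces recovers the
# --- word-level string that BLEU is computed on.
def char_to_words(s):
    return s.replace(' ', '').replace('|', ' ')

# char_to_words('h e l l o | w o r l d') -> 'hello world'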
def main(args, override_args=None): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' use_fp16 = args.fp16 use_cuda = torch.cuda.is_available() and not args.cpu if override_args is not None: overrides = vars(override_args) overrides.update(eval(getattr(override_args, 'model_overrides', '{}'))) else: overrides = None # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( [args.path], arg_overrides=overrides, suffix=getattr(args, "checkpoint_suffix", ""), ) model = models[0] # Move models to GPU for model in models: if use_fp16: model.half() if use_cuda: model.cuda() # Print args logger.info(model_args) # Build criterion criterion = task.build_criterion(model_args) criterion.eval() for subset in args.valid_subset.split(','): try: task.load_dataset(subset, combine=False, epoch=1) dataset = task.dataset(subset) except KeyError: raise Exception('Cannot find dataset: ' + subset) # Initialize data iterator itr = task.get_batch_iterator( dataset=dataset, max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[m.max_positions() for m in models], ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=args.log_format, log_interval=args.log_interval, prefix=f"valid on '{subset}' subset", default_log_format=('tqdm' if not args.no_progress_bar else 'simple'), ) log_outputs = [] for i, sample in enumerate(progress): sample = utils.move_to_cuda(sample) if use_cuda else sample _loss, _sample_size, log_output = task.valid_step( sample, model, criterion) progress.log(log_output, step=i) log_outputs.append(log_output) with metrics.aggregate() as agg: task.reduce_metrics(log_outputs, criterion) log_output = agg.get_smoothed_values() progress.print(log_output, tag=subset, step=i)
def validate(args, trainer, task, epoch_itr, subsets, experiment=None): """Evaluate the model on the validation set(s) and return the losses.""" if args.fixed_validation_seed is not None: # set fixed seed for every validation utils.set_torch_seed(args.fixed_validation_seed) valid_losses = [] for subset in subsets: # Initialize data iterator itr = task.get_batch_iterator( dataset=task.dataset(subset), max_tokens=args.max_tokens_valid, max_sentences=args.max_sentences_valid, max_positions=utils.resolve_max_positions( task.max_positions(), trainer.get_model().max_positions()), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.build_progress_bar( args, itr, epoch_itr.epoch, prefix="valid on '{}' subset".format(subset), no_progress_bar="simple", ) # reset validation loss meters for k in ["valid_loss", "valid_nll_loss"]: meter = trainer.get_meter(k) if meter is not None: meter.reset() extra_meters = collections.defaultdict(lambda: AverageMeter()) for sample in progress: log_output = trainer.valid_step(sample) for k, v in log_output.items(): if k in [ "loss", "nll_loss", "ntokens", "nsentences", "sample_size" ]: continue extra_meters[k].update(v) # log validation stats stats = get_valid_stats(trainer, args, extra_meters) for k, meter in extra_meters.items(): stats[k] = meter.avg progress.print(stats, tag=subset, step=trainer.get_num_updates()) if experiment: experiment.log_metrics(stats, prefix="validation_{}".format(subset), step=stats["num_updates"]) valid_losses.append(stats[args.best_checkpoint_metric].avg if args. best_checkpoint_metric == "loss" else stats[args.best_checkpoint_metric]) return valid_losses
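# --- Hypothetical stand-in for the `experiment` object used above. validate()
# --- only calls experiment.log_metrics(stats, prefix=..., step=...), so any
# --- object with that method (a Comet ML Experiment, or this no-op stub for
# --- dry runs) can be passed in.
class NullExperiment:
    def log_metrics(self, metrics, prefix=None, step=None):
        for k, v in metrics.items():
            name = f'{prefix}_{k}' if prefix else k
            print(f'[metrics] step={step} {name}={v}')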
def main(args): if args.buffer_size < 1: args.buffer_size = 1 if args.max_tokens is None and args.max_sentences is None: args.max_sentences = 1 assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not args.max_sentences or args.max_sentences <= args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Setup task, e.g., translation task = tasks.setup_task(args) # Load ensemble print('| loading model(s) from {}'.format(args.path)) model_paths = args.path.split(':') models, model_args = utils.load_ensemble_for_inference(model_paths, task, model_arg_overrides=eval(args.model_overrides)) # Set dictionaries tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() # Initialize generator translator = SequenceGenerator( models, tgt_dict, beam_size=args.beam, minlen=args.min_len, stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized), len_penalty=args.lenpen, unk_penalty=args.unkpen, sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature, diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength, ) if use_cuda: translator.cuda() # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) def make_result(src_str, hypos): result = Translation( src_str='O\t{}'.format(src_str), hypos=[], pos_scores=[], alignments=[], ) # Process top predictions for hypo in hypos[:min(len(hypos), args.nbest)]: hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str)) result.pos_scores.append('P\t{}'.format( ' '.join(map( lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist(), )) )) result.alignments.append( 'A\t{}'.format(' '.join(map(lambda x: str(utils.item(x)), alignment))) if args.print_alignment else None ) return result def process_batch(batch): tokens = batch.tokens lengths = batch.lengths if use_cuda: tokens = tokens.cuda() lengths = lengths.cuda() translations = translator.generate( tokens, lengths, maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b), ) return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)] max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models] ) if args.buffer_size > 1: print('| Sentence buffer size:', args.buffer_size) print('| Type the input sentence and press return:') for inputs in buffered_read(args.buffer_size): indices = [] results = [] for batch, batch_indices in make_batches(inputs, args, task, max_positions): indices.extend(batch_indices) results += process_batch(batch) for i in np.argsort(indices): result = results[i] print(result.src_str) for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments): print(hypo) print(pos_scores) if align is not None: print(align)
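# --- Hedged sketch of the buffered_read helper this interactive loop uses:
# --- yield lists of up to buffer_size stripped lines read from stdin. (The
# --- variants further below additionally take an input path.)
import sys

def buffered_read(buffer_size):
    buffer = []
    for src_str in sys.stdin:
        buffer.append(src_str.strip())
        if len(buffer) >= buffer_size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer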
def validate(args, trainer, task, epoch_itr, subsets): """Evaluate the model on the validation set(s) and return the losses.""" valid_losses = OrderedDict() valid_bleus = OrderedDict() valid_select = task.args.valid_select[0] assert len(subsets) == 1 for subset in subsets: # Initialize data iterator valid_loss_all = [] valid_nll_loss_all = [] valid_bleu_all = [] for k in ['valid_loss', 'valid_nll_loss', 'valid_bleu']: meter = trainer.get_meter(k + '_all') meter.reset() for domain, data_valid in task.dataset(subset).items(): itr = task.get_batch_iterator_valid( dataset=data_valid, max_tokens=args.max_tokens, max_sentences=args.max_sentences_valid, max_positions=utils.resolve_max_positions( task.max_positions(), trainer.get_model().max_positions(), ), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ).next_epoch_itr(shuffle=False) progress = progress_bar.build_progress_bar( args, itr, epoch_itr.epoch, prefix='valid on \'{}\' subset \'{}\' domain'.format( subset, domain), no_progress_bar='simple') # reset validation loss meters for k in ['valid_loss', 'valid_nll_loss', 'valid_bleu']: meter = trainer.get_meter(k + '_' + domain) meter.reset() extra_meters = collections.defaultdict(lambda: AverageMeter()) src_target_hypo_strs = [] for sample in progress: log_output, src_target_hypo_str = trainer.valid_step( sample, domain=domain) src_target_hypo_strs.extend(src_target_hypo_str) for k, v in log_output.items(): if k in [ 'loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size' ]: continue extra_meters[k].update(v) src_target_hypo_strs_filter = [] for sents in src_target_hypo_strs: for sent in sents: if sent is None or len(sent) == 0: continue src_target_hypo_strs_filter.append(sent) src_target_hypo_strs_filter = sorted(src_target_hypo_strs_filter, key=lambda elem: int(elem[0]), reverse=False) if args.valid_decoding_path is not None: with open( os.path.join( args.valid_decoding_path, domain, 'decoding_{}.txt'.format(args.distributed_rank)), 'w') as f: for sent in src_target_hypo_strs_filter: if len(sent) == 0: continue f.write(sent[-1] + '\n') num_ref = args.num_ref[domain] ref_path = [] for i in range(int(num_ref)): ref_path.append( os.path.join(args.valid_decoding_path, domain, 'valid.tok.' 
+ args.target_lang + str(i))) valid_decoding_path = os.path.join( args.valid_decoding_path, domain, 'decoding_{}.txt'.format(args.distributed_rank)) with open(valid_decoding_path) as out_file: out_file.seek(0) res = subprocess.check_output( 'perl %s/multi-bleu.perl %s' % (args.multi_bleu_path, ' '.join(ref_path)), stdin=out_file, shell=True).decode("utf-8") trainer.get_meter('valid_bleu_' + domain).update( float(res.split(',')[0].split('=')[1]), 1.0) # log validation stats stats = get_valid_stats(trainer, domain=domain, valid_select=valid_select) for k in ['loss', 'nll_loss', 'bleu']: stats[k] = stats[k].avg for k, meter in extra_meters.items(): stats[k] = meter.avg progress.print(stats, tag=os.path.join(subset, domain), step=trainer.get_num_updates()) valid_losses.update({domain: stats['loss']}) valid_bleus.update({domain: stats['bleu']}) valid_loss_all.append(stats['loss']) valid_nll_loss_all.append(stats['nll_loss']) valid_bleu_all.append(stats['bleu']) trainer.get_meter('valid_loss_all').update(np.mean(valid_loss_all), 1.0) trainer.get_meter('valid_nll_loss_all').update( np.mean(valid_nll_loss_all), 1.0) trainer.get_meter('valid_bleu_all').update(np.mean(valid_bleu_all), 1.0) stats = get_valid_stats(trainer, domain='all', valid_select=valid_select) for k in ['loss', 'nll_loss', 'bleu']: stats[k] = stats[k].avg progress = progress_bar.build_progress_bar( args, [0], epoch_itr.epoch, prefix='valid on \'{}\' subset \'{}\' domain'.format( subset, 'all'), no_progress_bar='simple') progress.print(stats, tag=os.path.join(subset, 'all'), step=trainer.get_num_updates()) valid_losses.update({'all': stats['loss']}) valid_bleus.update({'all': stats['bleu']}) return valid_losses, valid_bleus
def main(args): import_user_module(args) if args.buffer_size < 1: args.buffer_size = 1 if args.max_tokens is None and args.max_sentences is None: args.max_sentences = 1 assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not args.max_sentences or args.max_sentences <= args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' print(args) use_cuda = torch.cuda.is_available() and not args.cpu # Setup task, e.g., translation task = tasks.setup_task(args) # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, _model_args = utils.load_ensemble_for_inference( args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides), ) # Set dictionaries src_dict = task.source_dictionary ctx_dict = task.context_dictionary # [CONTEXT] tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Initialize generator generator = task.build_generator(args) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models] ) if args.buffer_size > 1: print('| Sentence buffer size:', args.buffer_size) print('| Type the input sentence and press return:') start_id = 0 for inputs in buffered_read(args.input, args.buffer_size): results = [] for batch in make_batches(inputs, args, task, max_positions): src_tokens = batch.src_tokens src_lengths = batch.src_lengths # [CONTEXT] ctx_tokens = batch.ctx_tokens ctx_lengths = batch.ctx_lengths if use_cuda: src_tokens = src_tokens.cuda() src_lengths = src_lengths.cuda() # [CONTEXT] ctx_tokens = ctx_tokens.cuda() ctx_lengths = ctx_lengths.cuda() sample = { 'net_input': { 'src_tokens': src_tokens, 'src_lengths': src_lengths, # [CONTEXT] 'ctx_tokens': ctx_tokens, 'ctx_lengths': ctx_lengths, }, } translations = task.inference_step(generator, models, sample) for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): ctx_tokens_i = utils.strip_pad(ctx_tokens[i], tgt_dict.pad()) # [CONTEXT] src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) # [CONTEXT] # results.append((start_id + id, src_tokens_i, hypos)) results.append((start_id + id, ctx_tokens_i, src_tokens_i, hypos)) # sort output to match input order # [CONTEXT] # for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]): for id, ctx_tokens, src_tokens, hypos in sorted(results, key=lambda x: x[0]): # [CONTEXT] if ctx_dict is not None: ctx_str = ctx_dict.string(ctx_tokens, args.remove_bpe) print('C-{}\t{}'.format(id, ctx_str)) if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) print('S-{}\t{}'.format(id, src_str)) # Process top predictions for hypo in hypos[:min(len(hypos), args.nbest)]: hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) print('H-{}\t{}\t{}'.format(id, hypo['score'], hypo_str)) print('P-{}\t{}'.format( id, ' '.join(map(lambda x: '{:.4f}'.format(x), 
hypo['positional_scores'].tolist())) )) if args.print_alignment: print('A-{}\t{}'.format( id, ' '.join(map(lambda x: str(utils.item(x)), alignment)) )) # update running id counter start_id += len(results)
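# --- Hedged sketch of a make_batches helper shaped like the ones called above,
# --- following stock fairseq interactive.py; the [CONTEXT] variant in the
# --- preceding function would additionally tokenize and batch
# --- ctx_tokens/ctx_lengths.
from collections import namedtuple
import torch

Batch = namedtuple('Batch', 'ids src_tokens src_lengths')

def make_batches(lines, args, task, max_positions):
    tokens = [
        task.source_dictionary.encode_line(line, add_if_not_exist=False).long()
        for line in lines
    ]
    lengths = torch.LongTensor([t.numel() for t in tokens])
    itr = task.get_batch_iterator(
        dataset=task.build_dataset_for_inference(tokens, lengths),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            ids=batch['id'],
            src_tokens=batch['net_input']['src_tokens'],
            src_lengths=batch['net_input']['src_lengths'],
        )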
def main(args, init_distributed=False): import_user_module(args) if args.max_tokens is None: args.max_tokens = 6000 print(args) if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Initialize distributed training (after data loading) if init_distributed: import socket args.distributed_rank = distributed_utils.distributed_init(args) print('| initialized host {} as rank {}'.format( socket.gethostname(), args.distributed_rank)) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) TRAIN_TAG = args.train_domains[0] dummy_batch = task.dataset('train')[TRAIN_TAG].get_dummy_batch( args.max_tokens, max_positions) oom_batch = task.dataset('train')[TRAIN_TAG].get_dummy_batch( 1, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch], domain=TRAIN_TAG) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_bleus = [None] valid_subsets = args.valid_subset.split(',') valid_select = args.valid_select[0] with open(os.path.join(args.tensorboard_logdir, 'args_log.txt'), 'w') as f: for k in args.__dict__.keys(): f.write("'%s':'%s', \n" % (k, args.__dict__[k])) while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses, valid_bleus = validate(args, trainer, task, epoch_itr, valid_subsets) save_checkpoint(args, trainer, epoch_itr, valid_losses, valid_bleus, valid_select) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[valid_select]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) oom_batch = task.dataset('train').get_dummy_batch(1, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
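# --- Hedged sketch of the load_dataset_splits helper called above, following
# --- the old fairseq train.py convention: 'train' is loaded with combine=True,
# --- while validation splits may be numbered (valid, valid1, ...).
import itertools

def load_dataset_splits(task, splits):
    for split in splits:
        if split == 'train':
            task.load_dataset(split, combine=True)
        else:
            for k in itertools.count():
                split_k = split + (str(k) if k > 0 else '')
                try:
                    task.load_dataset(split_k, combine=False)
                except FileNotFoundError as e:
                    if k > 0:
                        break
                    raise e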
def _main(args, output_file): logging.basicConfig( format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, stream=output_file, ) logger = logging.getLogger('fairnr_cli.render') utils.import_user_module(args) if args.max_tokens is None and args.max_sentences is None: args.max_tokens = 12000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load dataset splits task = tasks.setup_task(args) task.load_dataset(args.gen_subset) # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(os.pathsep), arg_overrides=eval(args.model_overrides), task=task, ) # Optimize ensemble for generation for model in models: if args.fp16: model.half() if use_cuda: model.cuda() logging.info(model) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(args.gen_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models]), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_workers=args.num_workers).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() generator = task.build_generator(args) shard_id, world_size = args.distributed_rank, args.distributed_world_size output_files = [] if generator.test_poses is not None: total_frames = generator.test_poses.shape[0] _frames = int(np.floor(total_frames / world_size)) step = shard_id * _frames frames = _frames if shard_id < (world_size - 1) else total_frames - step else: step = shard_id * args.render_num_frames frames = args.render_num_frames with progress_bar.build_progress_bar(args, itr) as t: wps_meter = TimeMeter() for i, sample in enumerate(t): sample = utils.move_to_cuda(sample) if use_cuda else sample gen_timer.start() step, _output_files = task.inference_step(generator, models, [sample, step, frames]) output_files += _output_files gen_timer.stop(500) wps_meter.update(500) t.log({'wps': round(wps_meter.avg)}) timestamp = generator.save_images( output_files, steps='shard{}'.format(shard_id), combine_output=args.render_combine_output) # join videos from all GPUs and delete temp files try: timestamps = distributed_utils.all_gather_list(timestamp) except: timestamps = [timestamp] if shard_id == 0: generator.merge_videos(timestamps)
def main(args): utils.import_user_module(args) if args.buffer_size < 1: args.buffer_size = 1 if args.max_tokens is None and args.max_sentences is None: args.max_sentences = 1 assert not args.sampling or args.nbest == args.beam, \ '--sampling requires --nbest to be equal to --beam' assert not args.max_sentences or args.max_sentences <= args.buffer_size, \ '--max-sentences/--batch-size cannot be larger than --buffer-size' logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Setup task, e.g., translation task = tasks.setup_task(args) # Load ensemble logger.info('loading model(s) from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble( args.path.split(os.pathsep), arg_overrides=eval(args.model_overrides), task=task, ) # Set dictionaries src_dict = task.source_dictionary tgt_dict = task.target_dictionary # Optimize ensemble for generation for model in models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if args.fp16: model.half() if use_cuda: model.cuda() # Initialize generator generator = task.build_generator(models, args) # Handle tokenization and BPE tokenizer = encoders.build_tokenizer(args) bpe = encoders.build_bpe(args) def encode_fn(x): if tokenizer is not None: x = tokenizer.encode(x) if bpe is not None: x = bpe.encode(x) return x def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(args.replace_unk) max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models] ) if args.buffer_size > 1: logger.info('Sentence buffer size: %s', args.buffer_size) logger.info('NOTE: hypothesis and token scores are output in base 2') logger.info('Type the input sentence and press return:') start_id = 0 for inputs in buffered_read(args.input, args.buffer_size): results = [] for batch in make_batches(inputs, args, task, max_positions, encode_fn): src_tokens = batch.src_tokens src_lengths = batch.src_lengths if use_cuda: src_tokens = src_tokens.cuda() src_lengths = src_lengths.cuda() sample = { 'net_input': { 'src_tokens': src_tokens, 'src_lengths': src_lengths, }, } translations = task.inference_step(generator, models, sample) for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) results.append((start_id + id, src_tokens_i, hypos)) # sort output to match input order for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]): if src_dict is not None: src_str = src_dict.string(src_tokens, args.remove_bpe) print('S-{}\t{}'.format(id, src_str)) # Process top predictions for hypo in hypos[:min(len(hypos), args.nbest)]: hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe, ) detok_hypo_str = decode_fn(hypo_str) score = hypo['score'] / math.log(2) # convert to base 2 # original hypothesis (after tokenization and BPE) print('H-{}\t{}\t{}'.format(id, score, hypo_str)) # detokenized hypothesis print('D-{}\t{}\t{}'.format(id, score, detok_hypo_str)) print('P-{}\t{}'.format( id, ' '.join(map( lambda x: '{:.4f}'.format(x), # convert from base e to base 2 
hypo['positional_scores'].div_(math.log(2)).tolist(), )) )) if args.print_alignment: alignment_str = " ".join(["{}-{}".format(src, tgt) for src, tgt in alignment]) print('A-{}\t{}'.format( id, alignment_str )) # update running id counter start_id += len(inputs)
def validate(args, trainer, task, epoch_itr, subsets):
    """Evaluate the model on the validation set(s) and return the losses."""
    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(
            args, itr, epoch_itr.epoch,
            prefix='valid on \'{}\' subset'.format(subset),
            no_progress_bar='simple'
        )

        # reset validation loss meters
        for k in ['valid_loss', 'valid_nll_loss']:
            meter = trainer.get_meter(k)
            if meter is not None:
                meter.reset()
        extra_meters = collections.defaultdict(lambda: AverageMeter())

        correct_num = 0
        predict_num = 0
        target_num = 0
        for sample in progress:
            log_output = trainer.valid_step(sample)
            for k, v in log_output.items():
                if k in ['loss', 'nll_loss', 'ntokens', 'nsentences', 'sample_size']:
                    continue
                if k == 'correct':
                    correct_num += v
                elif k == 'predict':
                    predict_num += v
                elif k == 'target':
                    target_num += v
                else:
                    extra_meters[k].update(v)

        # log validation stats
        stats = get_valid_stats(trainer)
        for k, meter in extra_meters.items():
            stats[k] = meter.avg
        # precision (reported as 'acc'), recall and F1, guarded against
        # empty prediction/target counts
        stats['acc'] = correct_num / predict_num if predict_num > 0 else 0.0
        stats['recall'] = correct_num / target_num if target_num > 0 else 0.0
        denom = stats['acc'] + stats['recall']
        stats['F1'] = 2 * stats['acc'] * stats['recall'] / denom if denom > 0 else 0.0
        stats['acc_num'] = '{}-{}-{}'.format(correct_num, predict_num, target_num)
        progress.print(stats)
        valid_losses.append(stats['valid_loss'])
    return valid_losses
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'

    utils.import_user_module(parsed_args)

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(
        parsed_args.path.split(':'), task,
        model_arg_overrides=parsed_args.model_overrides,
    )

    # Copy over all args except those the checkpoint must control
    for arg in vars(parsed_args).keys():
        if arg not in {'self_target', 'future_target', 'past_target', 'tokens_per_sample', 'output_size_dictionary'}:
            setattr(args, arg, getattr(parsed_args, arg))
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0

    print('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(*[model.max_positions() for model in models]),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary)

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        if args.remove_bpe == 'sentencepiece':
            raise NotImplementedError
        else:
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = set(
                i for i in range(len(task.dictionary))
                if task.dictionary[i].endswith(bpe_cont)
            )
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            gen_timer.start()
            hypos = scorer.generate(models, sample)
            gen_timer.stop(sample['ntokens'])

            for hypos_i in hypos:
                hypo = hypos_i[0]
                pos_scores = hypo['positional_scores']

                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(len(hypo['tokens']) - 1):
                        if hypo['tokens'][i].item() in bpe_toks:
                            skipped_toks += 1
                            # Fold the continuation token's score into the next token
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum().cpu()
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs or args.output_word_stats:
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(hypo['tokens'])):
                        w_ind = hypo['tokens'][i].item()
                        w += task.dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))

                            # Find the score of the next non-folded token, if any
                            next_prob = None
                            ind = i + 1
                            while ind < len(hypo['tokens']):
                                if pos_scores[ind].item() != 0:
                                    next_prob = pos_scores[ind]
                                    break
                                ind += 1

                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(pos_scores[i].item(), next_prob)
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        print('\t'.join('{} [{:.2f}]'.format(x[0], x[1]) for x in word_prob))

            wps_meter.update(sample['ntokens'])
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss)))
    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            print(ws)
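# For reference, the perplexity printed above is simply exp of the average
# negative log-likelihood per token: positional scores are natural-log
# probabilities, so np.exp is the matching inverse. A minimal sketch with
# made-up numbers:
import numpy as np

token_log_probs = np.array([-1.2, -0.4, -2.3, -0.9])  # hypothetical natural-log probs
avg_nll = -token_log_probs.sum() / token_log_probs.size
perplexity = np.exp(avg_nll)  # ~3.32 for these numbers
print('Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll, perplexity))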
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str), file=output_file)
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str), file=output_file)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        score = hypo['score'] / math.log(2)  # convert to base 2
                        print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file)
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(
                                lambda x: '{:.4f}'.format(x),
                                # convert from base e to base 2
                                hypo['positional_scores'].div_(math.log(2)).tolist(),
                            ))
                        ), file=output_file)

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(['{}-{}'.format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment])
                            ), file=output_file)

                        if args.print_step:
                            print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file)

                        if getattr(args, 'retain_iter_history', False):
                            for step, h in enumerate(hypo['history']):
                                _, h_str, _ = utils.post_process_prediction(
                                    hypo_tokens=h['tokens'].int().cpu(),
                                    src_str=src_str,
                                    alignment=None,
                                    align_dict=None,
                                    tgt_dict=tgt_dict,
                                    remove_bpe=None,
                                )
                                print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file)

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info('Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        logger.info('Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))

    return scorer
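# The H-/P- lines above report scores in base 2 by dividing natural-log scores
# by math.log(2), using the identity log2(x) = ln(x) / ln(2). A minimal sketch
# of that conversion (the example value is hypothetical):
import math

ln_score = -0.6931  # hypothetical natural-log probability, ~ln(0.5)
base2_score = ln_score / math.log(2)  # ~-1.0, i.e. log2(0.5)
print('{:.4f}'.format(base2_score))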
def model_fn(model_dir):
    model_name = 'checkpoint_best.pt'
    model_path = os.path.join(model_dir, model_name)

    logger.info('Loading the model')
    with open(model_path, 'rb') as f:
        model_info = torch.load(f, map_location=torch.device('cpu'))

    # Will be overridden by model_info['args'] - need to keep for pre-trained models
    parser = options.get_generation_parser(interactive=True)
    # Get args for fairseq by converting the hyperparameters as if they were command-line arguments
    argv_copy = copy.deepcopy(sys.argv)
    # Temporarily replace the real command-line arguments
    sys.argv[1:] = ['--path', model_path, model_dir]
    args = options.parse_args_and_arch(parser)
    # Restore the previous command-line args
    sys.argv = argv_copy

    saved_args = model_info['args']
    for key, value in vars(saved_args).items():
        setattr(args, key, value)

    args.data = [model_dir]
    print(args)

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info('Current device: {}'.format(device))
    model_paths = [os.path.join(model_dir, model_name)]
    models, model_args = utils.load_ensemble_for_inference(model_paths, task, model_arg_overrides={})

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict,
        beam_size=args.beam,
        minlen=args.min_len,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups,
        diverse_beam_strength=args.diverse_beam_strength,
    )

    if device.type == 'cuda':
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    # align_dict = utils.load_align_dict(args.replace_unk)
    align_dict = utils.load_align_dict(None)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    return dict(
        translator=translator,
        task=task,
        max_positions=max_positions,
        align_dict=align_dict,
        tgt_dict=tgt_dict,
        args=args,
        device=device,
    )
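# model_fn above temporarily rewrites sys.argv so that fairseq's
# argparse-based option parsing can be reused outside the CLI. A minimal,
# generic sketch of that save/patch/restore pattern, using plain argparse
# only (the parser, flags, and helper name here are illustrative, not
# fairseq's API):
import argparse
import copy
import sys

def parse_with_patched_argv(argv_tail):
    parser = argparse.ArgumentParser()
    parser.add_argument('--path')
    parser.add_argument('data')
    argv_copy = copy.deepcopy(sys.argv)  # save the real command line
    sys.argv[1:] = argv_tail             # substitute the arguments to be parsed
    try:
        args = parser.parse_args()
    finally:
        sys.argv = argv_copy             # always restore the original argv
    return args

# Example:
# parse_with_patched_argv(['--path', '/opt/ml/model/checkpoint_best.pt', '/opt/ml/model'])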
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(parsed_args.path.split(':'), task)

    args.__dict__.update(parsed_args.__dict__)
    print(args)

    task.args = args

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()

    assert len(models) > 0

    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(*[model.max_positions() for model in models]),
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(models, task.target_dictionary)
    if use_cuda:
        scorer.cuda()

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        bpe_cont = args.remove_bpe.rstrip()
        bpe_toks = set(i for i in range(len(task.dictionary)) if task.dictionary[i].endswith(bpe_cont))
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    with progress_bar.build_progress_bar(args, itr) as t:
        results = scorer.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        wps_meter = TimeMeter()
        for _, src_tokens, __, hypos in results:
            for hypo in hypos:
                pos_scores = hypo['positional_scores']

                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(len(hypo['tokens']) - 1):
                        if hypo['tokens'][i].item() in bpe_toks:
                            skipped_toks += 1
                            # Fold the continuation token's score into the next token
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += utils.item(pos_scores.sum())
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs or args.output_word_stats:
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(hypo['tokens'])):
                        w_ind = hypo['tokens'][i].item()
                        w += task.dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))
                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(pos_scores[i].item())
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        print('\t'.join('{} [{:.2f}]'.format(x[0], x[1]) for x in word_prob))

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss)))
    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            print(ws)
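# Both scoring loops above fold the score of a BPE continuation token into the
# token that follows it, so per-word scores are not split across subwords. A
# minimal sketch of the same folding on plain Python lists (token ids and the
# continuation-marker convention are illustrative):
def fold_bpe_scores(tokens, scores, bpe_toks):
    """tokens: token ids; scores: per-token log-probs; bpe_toks: ids of BPE continuation pieces."""
    scores = list(scores)
    skipped = 0
    for i in range(len(tokens) - 1):
        if tokens[i] in bpe_toks:
            skipped += 1
            scores[i + 1] += scores[i]  # push this subword's score onto the next piece
            scores[i] = 0.0
    return scores, skipped

# Example: with tokens [5, 7, 9] where id 5 is a continuation piece,
# fold_bpe_scores([5, 7, 9], [-1.0, -0.5, -0.2], {5}) == ([0.0, -1.5, -0.2], 1)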