def validate(args, trainer, task, epoch_itr, subsets):
    """Evaluate the model on the validation set(s) and return the losses."""
    if args['dataset']['fixed_validation_seed'] is not None:
        # set fixed seed for every validation
        set_seed.set_torch_seed(args['dataset']['fixed_validation_seed'])

    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args['dataset']['max_tokens_valid'],
            max_sentences=args['dataset']['max_sentences_valid'],
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
            required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
            seed=args['common']['seed'],
            num_shards=args['distributed_training']['distributed_world_size'],
            shard_id=args['distributed_training']['distributed_rank'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(args['common']['tensorboard_logdir']
                                if distributed_utils.is_master(args) else None),
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'simple'),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for sample in progress:
                trainer.valid_step(sample)

        # log validation stats
        stats = get_valid_stats(args, trainer, agg.get_smoothed_values())
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(stats[args['checkpoint']['best_checkpoint_metric']])
    return valid_losses
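
# A minimal sketch (not part of the original code) of how the value returned by
# `validate` typically drives "best" checkpoint selection inside
# `checkpoint_utils.save_checkpoint`: the comparison flips when the metric should
# be maximized (e.g., BLEU) rather than minimized (loss). The names `prev_best`
# and `maximize_best_checkpoint_metric` follow fairseq-style conventions and are
# assumptions here, not taken from this file.
def is_better(valid_loss, prev_best, maximize_best_checkpoint_metric=False):
    if prev_best is None:
        return True
    if maximize_best_checkpoint_metric:
        return valid_loss > prev_best
    return valid_loss < prev_best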
def train(args, trainer, task, epoch_itr):
    """Train the model for one epoch."""
    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args['distributed_training']['fix_batches_to_gpus'],
        # shuffle=(epoch_itr.next_epoch_idx > args['dataset']['curriculum']),
        shuffle=False,
    )
    update_freq = (args['optimization']['update_freq'][epoch_itr.epoch - 1]
                   if epoch_itr.epoch <= len(args['optimization']['update_freq'])
                   else args['optimization']['update_freq'][-1])
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args['common']['log_format'],
        log_interval=args['common']['log_interval'],
        epoch=epoch_itr.epoch,
        tensorboard_logdir=(args['common']['tensorboard_logdir']
                            if distributed_utils.is_master(args) else None),
        default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'simple'),
    )

    # task-specific setup per epoch
    task.begin_epoch(epoch_itr.epoch, trainer.get_model())

    valid_subsets = args['dataset']['valid_subset'].split(',')
    max_update = args['optimization']['max_update'] or math.inf
    num_updates = 0  # init as 0, for zero-shot learning
    for samples in progress:
        with metrics.aggregate('train_inner'):
            log_output = trainer.train_step(samples)
            if log_output is None:  # OOM, overflow, ...
                continue

        # log mid-epoch stats
        num_updates = trainer.get_num_updates()
        if num_updates % args['common']['log_interval'] == 0:
            stats = get_training_stats(metrics.get_smoothed_values('train_inner'))
            progress.log(stats, tag='train_inner', step=num_updates)
            # reset mid-epoch meters after logging
            metrics.reset_meters('train_inner')

        if (not args['dataset']['disable_validation']
                and args['checkpoint']['save_interval_updates'] > 0
                and num_updates % args['checkpoint']['save_interval_updates'] == 0
                and num_updates > 0):
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if num_updates >= max_update:
            break

    # log end-of-epoch stats
    stats = get_training_stats(metrics.get_smoothed_values('train'))
    progress.print(stats, tag='train', step=num_updates)

    # reset epoch-level meters
    metrics.reset_meters('train')
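
# A minimal sketch (not part of the original code) of what `GroupedIterator`
# does with `update_freq`: it yields chunks of `update_freq` batches, so that
# `trainer.train_step(samples)` can accumulate gradients over a whole chunk
# before performing a single optimizer update.
def grouped(iterable, chunk_size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk  # last, possibly smaller, group

# e.g. list(grouped(range(5), 2)) == [[0, 1], [2, 3], [4]]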
def validate(args, trainer, task, epoch_itr, subsets):
    """Evaluate the model on the validation set(s) and return the losses."""
    if args['dataset']['fixed_validation_seed'] is not None:
        # set fixed seed for every validation
        set_seed.set_torch_seed(args['dataset']['fixed_validation_seed'])

    valid_losses = []
    for subset in subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args['dataset']['max_tokens_valid'],
            max_sentences=args['dataset']['max_sentences_valid'],
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
            required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
            seed=args['common']['seed'],
            num_shards=args['distributed_training']['distributed_world_size'],
            shard_id=args['distributed_training']['distributed_rank'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(args['common']['tensorboard_logdir']
                                if distributed_utils.is_master(args) else None),
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'simple'),
        )

        accs, mrrs, maps, ndcgs = [], [], [], []
        trainer.model.eval()
        trainer.criterion.eval()
        with torch.no_grad():
            for sample in progress:
                sample = trainer._prepare_sample(sample)
                inputs = list(sample['net_input'].values())
                code_repr = trainer.model.code_forward(*inputs[:6])
                desc_repr = trainer.model.desc_forward(*inputs[6:8])
                # cosine similarity between L2-normalized representations
                code_repr = code_repr / code_repr.norm(dim=-1, keepdim=True)
                desc_repr = desc_repr / desc_repr.norm(dim=-1, keepdim=True)
                similarity = code_repr @ desc_repr.t()
                acc, mrr, map_, ndcg = inference(similarity)
                accs.append(acc.mean().item())
                mrrs.append(mrr.mean().item())
                maps.append(map_.mean().item())
                ndcgs.append(ndcg.mean().item())
        accs = round(float(np.mean(accs)), 6)
        mrrs = round(float(np.mean(mrrs)), 6)
        maps = round(float(np.mean(maps)), 6)
        ndcgs = round(float(np.mean(ndcgs)), 6)
        stats = {'acc': accs, 'mrr': mrrs, 'map': maps, 'ndcg': ndcgs}
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(stats[args['checkpoint']['best_checkpoint_metric']])
    return valid_losses
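
# `inference(similarity)` is not shown in this file; the following is a hedged
# sketch of how such metrics are commonly computed when `similarity` is an
# [N, N] matrix whose diagonal scores each code snippet against its own
# description. With exactly one relevant item per query, MAP reduces to MRR and
# NDCG to 1/log2(1 + rank). `inference_sketch` is a hypothetical name.
import torch

def inference_sketch(similarity):
    correct = similarity.diag().unsqueeze(-1)            # score of the true pair
    ranks = (similarity >= correct).sum(dim=-1).float()  # rank of the true pair
    acc = (ranks == 1).float()
    mrr = 1.0 / ranks
    map_ = mrr                                           # single relevant item
    ndcg = 1.0 / torch.log2(ranks + 1.0)
    return acc, mrr, map_, ndcg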
def main(args, out_file=None):
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _ = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args['eval']['no_beamable_mm'] else args['eval']['beam'],
            need_attn=args['eval']['print_alignment'],
        )
        if use_cuda:
            # default to the first visible device
            device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
            torch.cuda.set_device(f'cuda:{device}')
            model = model.cuda()
        if args['common']['fp16'] and use_cuda:
            model.half()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
        num_shards=args['dataset']['num_shards'],
        shard_id=args['dataset']['shard_id'],
        num_workers=args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args['common']['log_format'],
        log_interval=args['common']['log_interval'],
        default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    sources, hypotheses, references = dict(), dict(), dict()

    for sample in progress:
        torch.cuda.empty_cache()
        sample = move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, bos_token=tgt_dict.bos())
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)  # TODO: warning
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()
            hypos_tokens = utils.strip_eos(hypos[i][0]['tokens'], tgt_dict.eos()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args['eval']['remove_bpe'])
            else:
                src_str = "0"
            if has_target:
                target_str = tgt_dict.string(target_tokens, args['eval']['remove_bpe'],
                                             escape_unk=True)
            hypo_str = tgt_dict.string(hypos_tokens, args['eval']['remove_bpe'])

            sources[sample_id] = [src_str]
            hypotheses[sample_id] = [hypo_str]
            references[sample_id] = [target_str]

    bleu, rouge_l, meteor = summarization_metrics.eval_accuracies(
        hypotheses, references, filename=out_file, mode='test')
    LOGGER.info('BLEU: {:.2f}\t ROUGE-L: {:.2f}\t METEOR: {:.2f}'.format(bleu, rouge_l, meteor))
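
# Hedged sketch (assumed semantics, not the library implementation) of the two
# token-cleanup helpers used above: `strip_pad` drops padding ids from a 1-D
# token tensor, while `strip_eos` truncates everything from the first EOS onward.
import torch

def strip_pad_sketch(tokens, pad_idx):
    return tokens[tokens.ne(pad_idx)]

def strip_eos_sketch(tokens, eos_idx):
    eos_pos = (tokens == eos_idx).nonzero(as_tuple=False)
    return tokens[:eos_pos[0, 0]] if len(eos_pos) > 0 else tokens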
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # default to the first visible device
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    # Load dataset splits
    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    sequence_completor = task.build_completor(models, args)

    subsets = [
        args['dataset']['train_subset'],
        args['dataset']['valid_subset'],
        args['dataset']['gen_subset'],
    ]
    for subset in subsets:
        task.load_dataset(subset, shuffle=False)
        task.dataset(subset).shuffle = False

        # Load dataset (possibly sharded)
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args['dataset']['max_tokens'],
            max_sentences=args['eval']['max_sentences_eval'],
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                *[model.max_positions() for model in models]),
            ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
            required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
            num_shards=_model_args['dataset']['num_shards'],
            shard_id=_model_args['dataset']['shard_id'],
            num_workers=_model_args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=_model_args['common']['log_format'],
            log_interval=_model_args['common']['log_interval'],
            default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
        )

        topk = args['kd']['gen_topk']
        out_idx, out_prob = [], []
        with torch.no_grad():
            for sample in progress:
                torch.cuda.empty_cache()
                sample = move_to_cuda(sample) if use_cuda else sample
                if 'net_input' not in sample:
                    continue
                # NOTE: `model` here is the last member of the loaded ensemble
                # (loop variable reuse from the setup loop above)
                net_output = sequence_completor.generate([model], sample, prefix_tokens=None)
                topk_prob, topk_ids = torch.topk(net_output[0], topk, dim=-1)
                # ignore pad
                non_padding_mask = sample['net_input']['src_tokens'] != task.target_dictionary.pad()
                if use_cuda:
                    topk_prob, topk_ids = topk_prob.cpu(), topk_ids.cpu()
                    non_padding_mask = non_padding_mask.cpu()
                for idx in range(topk_prob.size(0)):
                    out_idx.append(topk_ids[idx, ...][non_padding_mask[idx, ...]].view(-1).tolist())
                    out_prob.append(topk_prob[idx, ...][non_padding_mask[idx, ...]].view(-1).tolist())

        assert len(out_idx) == len(out_prob) == len(task.dataset(subset)), \
            Exception(len(out_idx), len(out_prob), len(task.dataset(subset)))

        TeacherOutDataset.save_bin(
            prefix=os.path.join(args['checkpoint']['save_dir'], f'{subset}.top{topk}_idx'),
            data_list=out_idx,
            dtype=np.int32,
        )
        TeacherOutDataset.save_bin(
            prefix=os.path.join(args['checkpoint']['save_dir'], f'{subset}.top{topk}_prob'),
            data_list=out_prob,
            dtype=np.float64,  # `np.float` (an alias of float64) was removed in NumPy 1.24+
        )
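
# A small self-contained demo (toy shapes only, not from the original code) of
# the masking logic above: `torch.topk` keeps the k highest-probability token
# ids per position, and the non-padding mask selects each sample's valid rows,
# producing the variable-length lists that `TeacherOutDataset.save_bin` writes.
import torch

logits = torch.randn(2, 4, 10)                   # [batch, seq_len, vocab]
pad_mask = torch.tensor([[1, 1, 1, 0],
                         [1, 1, 0, 0]]).bool()   # True where not padding
topk_prob, topk_ids = torch.topk(logits.softmax(-1), k=3, dim=-1)
rows = [topk_ids[b][pad_mask[b]].view(-1).tolist() for b in range(2)]
# rows[0] has 3 ids per non-pad position (3 positions -> 9 ids); rows[1] has 6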
def main(args, **unused_kwargs):
    assert args['eval']['path'] is not None, '--path required for evaluation!'

    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])

    LOGGER.info(args)
    # during evaluation, set fraction_using_func_name = 0, i.e., do not sample from func_name
    args['task']['fraction_using_func_name'] = 0.
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # default to the first visible device
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')
    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    for lang in deepcopy(args['dataset']['langs']):
        args['dataset']['langs'] = [lang]
        # Load dataset splits
        LOGGER.info(f'Evaluating {lang} dataset')
        task.load_dataset(args['dataset']['gen_subset'])
        dataset = task.dataset(args['dataset']['gen_subset'])

        # Optimize ensemble for generation and set the source and dest dicts
        # on the model (required by scorer)
        for model in models:
            model.make_generation_fast_()
            if args['common']['fp16']:
                model.half()
            if use_cuda:
                model.cuda()

        assert len(models) > 0
        LOGGER.info('num. model params: {}'.format(
            sum(p.numel() for p in models[0].parameters())))

        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=args['dataset']['max_tokens'] or 36000,
            max_sentences=args['eval']['max_sentences'],
            max_positions=utils.resolve_max_positions(
                *[model.max_positions() for model in models]),
            ignore_invalid_inputs=True,
            num_shards=args['dataset']['num_shards'],
            shard_id=args['dataset']['shard_id'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
        )

        code_reprs, query_reprs = [], []
        for sample in progress:
            if 'net_input' not in sample:
                continue
            sample = move_to_cuda(sample) if use_cuda else sample
            batch_code_reprs, batch_query_reprs = models[0](**sample['net_input'])
            if use_cuda:
                batch_code_reprs = batch_code_reprs.cpu().detach()
                batch_query_reprs = batch_query_reprs.cpu().detach()
            code_reprs.append(batch_code_reprs)
            query_reprs.append(batch_query_reprs)
        code_reprs = torch.cat(code_reprs, dim=0)
        query_reprs = torch.cat(query_reprs, dim=0)

        assert code_reprs.shape == query_reprs.shape, (code_reprs.shape, query_reprs.shape)
        eval_size = len(code_reprs) if args['eval']['eval_size'] == -1 else args['eval']['eval_size']

        k, MRR, topk_idx, topk_prob = 3, [], [], []
        for idx in range(len(dataset) // eval_size):
            # score within non-overlapping chunks of `eval_size` examples
            code_emb = code_reprs[idx * eval_size:(idx + 1) * eval_size, :]
            query_emb = query_reprs[idx * eval_size:(idx + 1) * eval_size, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'search_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'search_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            mrr = 1 / compared_scores.sum(dim=-1).float()
            MRR.extend(mrr.tolist())

        if len(dataset) % eval_size:
            # score the remainder against the last `eval_size` examples
            code_emb = code_reprs[-eval_size:, :]
            query_emb = query_reprs[-eval_size:, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'search_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'search_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            last_ids = len(code_reprs) % eval_size
            mrr = 1 / compared_scores.sum(dim=-1).float()[-last_ids:]
            MRR.extend(mrr.tolist())

        print('{}, mrr: {:.4f}'.format(lang, np.mean(MRR)))
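
# A tiny worked example (toy numbers, not from the code above) of the MRR
# computation: the diagonal holds each query's score against its true code
# snippet, and counting how many candidates score at least as high gives the
# rank of the ground truth.
import torch

logits = torch.tensor([[0.9, 0.2, 0.1],
                       [0.8, 0.3, 0.7],
                       [0.1, 0.2, 0.6]])
correct_scores = logits.diag()                            # [0.9, 0.3, 0.6]
ranks = (logits >= correct_scores.unsqueeze(-1)).sum(-1)  # [1, 3, 1]
mrr = 1 / ranks.float()                                   # [1.0, 0.333..., 1.0]
# mean MRR over this toy batch: (1 + 1/3 + 1) / 3 ≈ 0.778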
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000
    LOGGER.info(args)

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args['eval']['no_beamable_mm'] else args['eval']['beam'],
            need_attn=args['eval']['print_alignment'],
        )
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args['eval']['replace_unk'])

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Build the BLEU scorer
    scorer = OrderedDict()
    if args['eval']['sacrebleu']:
        scorer['bleu'] = bleu_scorer.SacrebleuScorer()
    elif args['eval']['nltk_bleu']:
        scorer['bleu'] = bleu_scorer.NLTKBleuScorer()
    else:
        scorer['bleu'] = bleu_scorer.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    # Optionally build the ROUGE scorer
    if args['eval']['rouge']:
        scorer['rouge'] = rouge_scorer.RougeScorer()

    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    # for sample in tqdm(progress, total=len(progress)):
    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args['eval']['prefix_size'] > 0:
            prefix_tokens = sample['target'][:, :args['eval']['prefix_size']]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args['dataset']['gen_subset']).src.get_original_text(sample_id)
                target_str = task.dataset(args['dataset']['gen_subset']).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args['eval']['remove_bpe'])
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args['eval']['remove_bpe'],
                                                 escape_unk=True)

            if not args['eval']['quiet']:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str), file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args['eval']['nbest']]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args['eval']['remove_bpe'],
                )
                if hypo_str == '.':
                    # ROUGE cannot handle the single-period hypothesis '.'
                    continue
                if not args['eval']['quiet']:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file)
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            # convert from base e to base 2
                            hypo['positional_scores'].div_(math.log(2)).tolist(),
                        ))), file=output_file)
                    if args['eval']['print_alignment']:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(['{}-{}'.format(src_idx, tgt_idx)
                                      for src_idx, tgt_idx in alignment])),
                            file=output_file)
                    if args['eval']['print_step']:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file)
                    # if getattr(args, 'retain_iter_history', False):
                    if args['eval']['retain_iter_history']:
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    # print('Ref>> {}'.format(target_str), file=output_file)
                    # print('Hyp>> {}'.format(hypo_str), file=output_file)
                    if align_dict is not None or args['eval']['remove_bpe'] is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                    for metric in scorer:
                        if hasattr(scorer[metric], 'add_string'):
                            scorer[metric].add_string(target_str, hypo_str)
                        else:
                            scorer[metric].add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    LOGGER.info('NOTE: hypothesis and token scores are output in base 2')
    LOGGER.info('Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
                .format(num_sentences, gen_timer.n, gen_timer.sum,
                        num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        LOGGER.info('Generate {} with beam={}: {}'.format(
            args['dataset']['gen_subset'], args['eval']['beam'],
            {'\n{}:\n{}'.format(str.upper(metric), value.score())
             for metric, value in scorer.items()}))

    return scorer
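
# Why the division by math.log(2) above works (standard identity, illustrated
# on toy values): the model emits natural-log probabilities, and
# log2(p) = ln(p) / ln(2), so one division converts every score to base 2.
import math

ln_score = math.log(0.25)            # natural-log probability, ~= -1.386
log2_score = ln_score / math.log(2)  # -2.0, i.e. log2(0.25)
assert abs(log2_score - math.log2(0.25)) < 1e-12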
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # default to the first visible device
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'], shuffle=False)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences_eval'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    # NOTE: `model` here is the last member of the loaded ensemble
    # (loop variable reuse from the setup loop above)
    sequence_completor = task.build_completor([model], args)

    accuracy = {'all': 0.}
    mrr = {'all': 0.}
    sample_num = {'all': 0.}
    if task.dataset('test').attrs is not None:
        for attr in task.dataset('test').attrs:
            accuracy[attr] = 0.
            mrr[attr] = 0.
            sample_num[attr] = 0

    def _eval(lprobs, target, idx, num):
        with torch.no_grad():
            lprobs = lprobs[idx]
            target = target[idx]
            accuracy = (torch.argmax(lprobs, dim=-1) == target).sum().float().item()
            # Ref: Code Prediction by Feeding Trees to Transformers
            # "With this practical perspective and for ease of computation, we only
            #  consider rank_i <= 10 for each location i (all rank_i > 10 will have
            #  a score of 0)."
            ranks = (lprobs >= lprobs[:, target].diag().unsqueeze(dim=-1)).sum(-1)
            mrr = 1. / ranks
            mrr[ranks > 10] = 0.
            mrr = mrr.sum().float().item()
        return accuracy, mrr, num

    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        with torch.no_grad():
            net_output = sequence_completor.generate([model], sample, prefix_tokens=None)

        # lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = torch.softmax(net_output[0], dim=-1)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = model.get_targets(sample, net_output).view(-1)

        # all tokens: ignore pad and unk
        idx = sample['net_input']['src_tokens'].view(-1) != task.target_dictionary.pad()
        idx[sample['target'].view(-1) == task.target_dictionary.unk()] = 0

        # ignore overlapping tokens
        max_len = sample['target'].size(-1)
        for i, ext_i in enumerate(sample['extends']):
            idx[i * max_len:i * max_len + ext_i] = 0

        batch_acc, batch_mrr, batch_num = _eval(lprobs, target, idx, num=idx.sum().item())
        accuracy['all'] += batch_acc
        mrr['all'] += batch_mrr
        sample_num['all'] += batch_num

        # other attrs
        if sample['attr_masks'] is not None:
            for attr, attr_idx in sample['attr_masks'].items():
                # pick out attr_idx entries that are not unk/pad
                attr_idx = attr_idx[idx[attr_idx].tolist()]
                if len(attr_idx) > 0:
                    batch_acc, batch_mrr, batch_num = _eval(lprobs, target, attr_idx,
                                                            num=attr_idx.size)
                    accuracy[attr] += batch_acc
                    mrr[attr] += batch_mrr
                    sample_num[attr] += batch_num

    for attr in accuracy.keys():
        avg_acc = round(accuracy[attr] / sample_num[attr], 6) if sample_num[attr] > 0. else None
        avg_mrr = round(mrr[attr] / sample_num[attr], 6) if sample_num[attr] > 0. else None
        print('[{}] tokens, accuracy: {}, MRR: {}'.format(attr, avg_acc, avg_mrr))
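
# A toy check (not part of the original code) of the rank trick inside `_eval`:
# `lprobs[:, target]` gathers every row's probability at every target id, so its
# diagonal is each position's probability of its own ground-truth token. An
# equivalent, memory-friendlier form is lprobs.gather(1, target.unsqueeze(-1)).
import torch

lprobs = torch.tensor([[0.1, 0.7, 0.2],
                       [0.5, 0.2, 0.3]])
target = torch.tensor([1, 0])
true_p = lprobs[:, target].diag()                 # [0.7, 0.5]
ranks = (lprobs >= true_p.unsqueeze(-1)).sum(-1)  # [1, 1] -> both rank 1
mrr = 1. / ranks.float()
mrr[ranks > 10] = 0.                              # ranks beyond 10 score 0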
def validate(args, trainer, task, epoch_itr, valid_subsets, dev_subsets, dev_refs):
    """Evaluate the model on the validation set(s) and return the losses."""
    if args['dataset']['fixed_validation_seed'] is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(args['dataset']['fixed_validation_seed'])

    for subset in valid_subsets:
        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args['dataset']['max_tokens_valid'],
            max_sentences=args['dataset']['max_sentences_valid'],
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
            required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
            seed=args['common']['seed'],
            num_shards=args['distributed_training']['distributed_world_size'],
            shard_id=args['distributed_training']['distributed_rank'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(args['common']['tensorboard_logdir']
                                if distributed_utils.is_master(args) else None),
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'simple'),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for sample in progress:
                trainer.valid_step(sample)

        # log validation stats
        stats = get_valid_stats(args, trainer, agg.get_smoothed_values())
        # calculate accuracy
        match = stats.pop('match')
        total = stats.pop('total')
        valid_acc = match / total
        progress.print(
            {
                'accuracy': f'{round(100. * valid_acc, 2)}%',
                'bleu': stats['bleu'],
                'loss': stats['loss'],
            },
            tag=subset,
            step=trainer.get_num_updates())

    # for subset in dev_subsets:
    #     hypotheses, references = {}, dev_refs
    #
    #     # Initialize data iterator
    #     itr = task.get_batch_iterator(
    #         dataset=task.dataset(subset),
    #         max_tokens=args['dataset']['max_tokens_valid'],
    #         max_sentences=args['dataset']['max_sentences_valid'],
    #         max_positions=utils.resolve_max_positions(
    #             task.max_positions(),
    #             trainer.get_model().max_positions(),
    #         ),
    #         ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
    #         required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
    #         seed=args['common']['seed'],
    #         num_shards=args['distributed_training']['distributed_world_size'],
    #         shard_id=args['distributed_training']['distributed_rank'],
    #         num_workers=args['dataset']['num_workers'],
    #     ).next_epoch_itr(shuffle=False)
    #     progress = progress_bar.progress_bar(
    #         itr,
    #         log_format=args['common']['log_format'],
    #         log_interval=args['common']['log_interval'],
    #         epoch=epoch_itr.epoch,
    #         prefix=f"valid on '{subset}' subset",
    #         tensorboard_logdir=(
    #             args['common']['tensorboard_logdir'] if distributed_utils.is_master(args) else None
    #         ),
    #         default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'simple'),
    #     )
    #
    #     # create a new root metrics aggregator so validation metrics
    #     # don't pollute other aggregators (e.g., train meters)
    #     with metrics.aggregate(new_root=True) as agg:
    #         for sample in progress:
    #             with torch.no_grad():
    #                 trainer.model.eval()
    #                 trainer.criterion.eval()
    #                 sample = trainer._prepare_sample(sample)
    #                 hyps, _, _, ids = trainer.task.step_out(sample, trainer.model)
    #                 for idx, hypo in zip(ids, hyps):
    #                     hypotheses[idx] = hypo
    #
    #     from third_party.pycocoevalcap.bleu.google_bleu import compute_bleu
    #     assert set(hypotheses.keys()) == set(references.keys())
    #     bleus = [
    #         compute_bleu([references[idx]], [hypotheses[idx]], smooth=True)[0]
    #         for idx in hypotheses.keys()
    #     ]
    #     dev_bleu = round(100. * sum(bleus) / len(bleus), 2)
    #     # log validation stats
    #     stats = agg.get_smoothed_values()
    #     stats['bleu'] = dev_bleu
    #     stats = get_dev_stats(args, trainer, stats)
    #     progress.print(stats, tag=subset, step=trainer.get_num_updates())
    # return valid_acc, dev_bleu
    return valid_acc, None
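
# A toy illustration (assumed meter semantics, not from the original code) of
# the accuracy computed above: `match` and `total` are smoothed counters
# accumulated across trainer.valid_step calls, so popping them and dividing
# yields corpus-level token accuracy for the subset.
stats = {'match': 42.0, 'total': 50.0, 'loss': 1.3, 'bleu': 17.5}
valid_acc = stats.pop('match') / stats.pop('total')  # 0.84
print(f'{round(100. * valid_acc, 2)}%')              # prints "84.0%"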