def _prepare_sample(self, sample, is_dummy=False):
    if sample == "DUMMY":
        raise Exception(
            "Trying to use an uninitialized 'dummy' batch. This usually indicates "
            "that the total number of batches is smaller than the number of "
            "participating GPUs. Try reducing the batch size or using fewer GPUs."
        )

    if sample is None or len(sample) == 0:
        assert (
            self._dummy_batch is not None and len(self._dummy_batch) > 0
        ), "Invalid dummy batch: {}".format(self._dummy_batch)
        sample, _ = self._prepare_sample(self._dummy_batch, is_dummy=True)
        return sample, True

    if self.args['common'].get('on_cpu_convert_precision', False):
        sample = self._fp_convert_sample(sample)

    if self.cuda:
        sample = move_to_cuda(sample)

    if not self.args['common'].get('on_cpu_convert_precision', False):
        sample = self._fp_convert_sample(sample)

    if self._dummy_batch == "DUMMY":
        self._dummy_batch = sample

    return sample, False
def train_step(self, samples, raise_oom=False):
    """Do forward, backward and parameter update."""
    # self._set_seed()
    # seed = self.args['common']['seed'] + self.get_num_updates()
    # torch.manual_seed(seed)
    # if self.cuda:
    #     torch.cuda.manual_seed(seed)
    # self.model.train()
    # self.criterion.train()
    # self.zero_grad()

    # forward and backward pass
    # logging_outputs, sample_size, ooms = [], 0, 0
    for i, sample in enumerate(samples):
        if self.cuda:
            sample = utils.move_to_cuda(sample)
        loss, sample_size_i, logging_output = self.task.train_step(
            sample=sample,
            model=self.model,
            criterion=self.criterion,
            optimizer=self.optimizer,
            update_num=self.get_num_updates(),
            ignore_grad=False,
        )
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
    return loss.item()
def get_prev_item(self, prev_model, task_idx, index):
    src_item = self.prev_tgts[task_idx][index][:-1]
    tgt_item = self.prev_tgts[task_idx][index][1:]
    # NOTE: `model` is assumed to be available in the enclosing scope
    # (e.g., a module-level model or a closure variable); only its weights change here.
    model.load_state_dict(prev_model)
    with torch.no_grad():
        tmp_src = torch.cat(
            [src_item, torch.LongTensor([self.pad] * (MAX_SENTENCE_LENGTH - len(src_item)))],
            dim=-1,
        )
        tmp_src = move_to_cuda(tmp_src[None, ...])
        hidden_reprs = model.extract_features(tmp_src).squeeze(dim=0)
    extend = 0 if self.extends is None else self.extends[index].item()
    if self.attrs_mapping:
        # do not move attr_masks onto CUDA
        attr_masks = {attr: [] for attr in self.attrs}
        for idx, attr_idx in enumerate(self.attr_indices[index].tolist()[1:][extend:], start=extend):
            if attr_idx in self.reversed_attrs_mapping:
                attr_masks[self.reversed_attrs_mapping[attr_idx]].append(idx)
        for attr in self.attrs:
            attr_masks[attr] = np.array(attr_masks[attr])
    else:
        attr_masks = None
    example = {
        'id': index,
        'source': src_item,
        'target': tgt_item,
        'hidden_reprs': hidden_reprs,
        'attr_masks': attr_masks,
        'extend': extend,
    }
    return example
def gen_outputs(args, task, trainer):
    # placeholder outputs, because some data might have been filtered out
    # by max_source/target_positions
    tmp_cache = [
        [8 * [0] for _ in range(6)],  # topk idx
        [8 * [0] for _ in range(6)],  # topk prob
    ]
    trainer.model.eval()
    itr = task.get_batch_iterator(
        dataset=task.dataset('train'),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['dataset']['max_sentences_valid'],
        # max_sentences=16,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            trainer.get_model().max_positions(),
        ),
        ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
        # required_batch_size_multiple=8,
        seed=args['common']['seed'],
        num_shards=args['distributed_training']['distributed_world_size'],
        shard_id=args['distributed_training']['distributed_rank'],
    ).next_epoch_itr(shuffle=False)
    # outputs = [None for _ in range(len(task.dataset('train')))]
    outputs = {}
    for sample in tqdm(itr, mininterval=5):
        with torch.no_grad():
            if sample is None or len(sample) == 0:
                continue
            sample = utils.move_to_cuda(sample)
            bs, srclen = sample['net_input']['src_tokens'].shape
            output = trainer.model(**sample['net_input'])[0].detach()
            non_padding_mask = sample['target'].ne(task.target_dictionary.pad()).cpu()
            _, tgtlen = sample['target'].shape
            topk_idx, topk_v = output2topk(output, args['kd']['distill_topk'])
            topk_x_shape = (bs, tgtlen, args['kd']['distill_topk'])
            topk_idx = topk_idx.view(*topk_x_shape).cpu().numpy()
            topk_v = topk_v.view(*topk_x_shape).cpu().numpy()
            non_padding_mask = non_padding_mask.view(*topk_x_shape[:2]).numpy().astype(bool)
            for b in range(bs):
                outputs[sample['id'][b].item()] = \
                    topk_idx[b, non_padding_mask[b]].tolist(), \
                    topk_v[b, non_padding_mask[b]].tolist()
    return [
        outputs[idx] if idx in outputs else tmp_cache
        for idx in range(len(task.dataset('train')))
    ]
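# NOTE: `output2topk` is not defined in this excerpt. Judging from its call site
# above, it selects the top-k entries over the vocabulary dimension and returns
# indices first, scores second. A minimal sketch under that assumption (a guess,
# not the project's actual implementation):
import torch

def output2topk(output, k):
    """output: (batch, tgt_len, vocab) logits -> top-k indices and scores."""
    topk_v, topk_idx = torch.topk(output, k, dim=-1)
    return topk_idx, topk_v

# toy check: 2 sequences, 3 target positions, vocab of 10
logits = torch.randn(2, 3, 10)
idx, v = output2topk(logits, k=4)
assert idx.shape == (2, 3, 4) and v.shape == (2, 3, 4)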
def valid_step(self, sample, raise_oom=False):
    """Do forward pass in evaluation mode."""
    with torch.no_grad():
        self.model.eval()
        self.criterion.eval()
        if self.cuda:
            sample = utils.move_to_cuda(sample)
        # _loss, sample_size, logging_output
        predictions, loss, sample_size, logging_output = self.task.valid_step(
            sample, self.model, self.criterion)
    return predictions, loss, sample_size, logging_output
def main(model_path, input):
    args, task, model, use_cuda = load_state(model_path)
    generator = task.build_generator(args)
    # encode input (and feed into gpu)
    input = task.encode_input(input)
    if use_cuda:
        input = utils.move_to_cuda(input)
    # feed input into model
    output = generator.generate(models=[model], sample=input)
    # decode
    output = task.decode_output(output)
    del task, model  # to release memory in cpu/gpu
    return output
def summarization_task(args, task, model, use_cuda, input, **kwargs):
    from ncc.tokenizers import tokenization
    generator = task.build_generator([model], args)
    # encode input (and feed into gpu)
    input = task.encode_input(input, tokenizer=tokenization._space_dpu_sub_tokenizer)
    if use_cuda:
        input = utils.move_to_cuda(input)
    # feed input into model
    output = generator.generate(models=[model], sample=input)
    # decode
    output = task.decode_output(output)
    del task, model  # to release memory in cpu/gpu
    return output
def completion_task(args, task, model, use_cuda, input, **kwargs):
    generator = task.build_generator([model], args)
    # encode input (and feed into gpu)
    input = task.encode_input(input, tokenizer=tokenization._space_tokenizer)
    if use_cuda:
        input = utils.move_to_cuda(input)
    # feed input into model
    output = generator.generate(models=[model], sample=input)
    # decode
    output = task.decode_output(output)
    del task, model  # to release memory in cpu/gpu
    top_tokens, probabilities = zip(*output)
    return {
        'top_tokens': top_tokens,
        'probabilities': probabilities,
    }
def main(model_path, input):
    LOGGER.info('Load model from {}'.format(model_path))
    state = load_checkpoint_to_cpu(model_path, arg_overrides={})
    args = state["args"]
    task = tasks.setup_task(args)  # load src/tgt dicts
    model = task.build_model(args)
    model.load_state_dict(state["model"])
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    use_cuda = 0  # NOTE: forces CPU inference
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.set_device(torch.cuda.device_count() - 1)
        model.cuda()
    model.eval()
    if args['common']['fp16'] and use_cuda:
        model.half()

    # TODO: source tensor should be handled in corresponding task scripts.
    #       Here we only use the seq2seq pipeline for instance.
    input_ids = task.target_dictionary.encode_string(input, line_tokenizer=None, add_if_not_exist=False)
    src_input_ids = input_ids.long().unsqueeze(dim=0)
    sample = {
        'net_input': {
            'src_tokens': src_input_ids,
        },
    }
    sample = utils.move_to_cuda(sample) if use_cuda else sample
    generator = task.sequence_completor
    net_output = generator.complete(models=[model], sample=sample)

    pred_prob = torch.softmax(net_output[0][0, -1, :], dim=-1)
    topk_prob, topk_idx = pred_prob.topk(k=10, dim=-1)
    # remove unk/eos/bos/pad
    topk_info = [(round(prob.item(), 6), idx.item()) for prob, idx in zip(topk_prob, topk_idx)][:5]
    topk_info = [(task.target_dictionary[idx], prob) for prob, idx in topk_info]
    pred_sentence = [
        (input[:-1] + [topk_token], topk_prob)
        for topk_token, topk_prob in topk_info
    ]
    return topk_info, pred_sentence
def main(model_path, input):
    state = load_checkpoint_to_cpu(model_path, arg_overrides={})
    args = state["args"]
    task = tasks.setup_task(args)  # load src/tgt dicts
    model = task.build_model(args)
    model.load_state_dict(state["model"])
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.set_device(torch.cuda.device_count() - 1)
        model.cuda()
    if args['common']['fp16'] and use_cuda:
        model.half()
    model.eval()

    # TODO: source tensor should be handled in corresponding task scripts.
    #       Here we only use the seq2seq pipeline for instance.
    src_input_ids = task.src_dict.encode_line(input, line_tokenizer=None, add_if_not_exist=False)
    src_input_ids = torch.cat([
        src_input_ids[:args['task']['max_source_positions'] - 1],
        torch.Tensor([task.src_dict.eos()]).long()
    ])
    padding_size = args['task']['max_source_positions'] - len(src_input_ids)
    if padding_size > 0:
        src_input_ids = torch.cat([
            src_input_ids,
            torch.Tensor([task.src_dict.pad()] * padding_size).long()
        ])
    # the batch dimension is needed on CPU as well, so unsqueeze unconditionally;
    # move_to_cuda below handles device placement
    src_input_ids = src_input_ids.unsqueeze(dim=0)
    sample = {
        'net_input': {
            'src_tokens': src_input_ids,
            'src_lengths': torch.LongTensor([s.numel() for s in src_input_ids]),
        },
    }
    sample = utils.move_to_cuda(sample) if use_cuda else sample
    generator = task.build_generator(args)
    pred_sentence_ids = generator.generate(models=[model], sample=sample)
    pred_sentence = task.tgt_dict.string(pred_sentence_ids[0][0]['tokens'])
    return pred_sentence
def _prepare_sample(self, sample):
    if sample == "DUMMY":
        raise Exception(
            "Trying to use an uninitialized 'dummy' batch. This usually indicates "
            "that the total number of batches is smaller than the number of "
            "participating GPUs. Try reducing the batch size or using fewer GPUs."
        )
    if sample is None or len(sample) == 0:
        return None
    if self.cuda:
        sample = utils.move_to_cuda(sample)

    def apply_half(t):
        if t.dtype is torch.float32:
            return t.half()
        return t

    if self.args['common']['fp16']:
        sample = utils.apply_to_sample(apply_half, sample)
    return sample
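# NOTE: `utils.apply_to_sample` comes from the ncc/fairseq utilities and is not
# shown here. A minimal self-contained sketch of the recursive traversal it
# performs, assuming samples are nested dicts/lists/tuples of tensors:
import torch

def apply_to_sample(f, sample):
    """Recursively apply ``f`` to every tensor in a (possibly nested) sample."""
    if torch.is_tensor(sample):
        return f(sample)
    if isinstance(sample, dict):
        return {k: apply_to_sample(f, v) for k, v in sample.items()}
    if isinstance(sample, (list, tuple)):
        return type(sample)(apply_to_sample(f, x) for x in sample)
    return sample

# usage: half-precision conversion as in `_prepare_sample` above
sample = {'net_input': {'src_tokens': torch.ones(2, 4, dtype=torch.float32)}}
half = apply_to_sample(lambda t: t.half() if t.dtype is torch.float32 else t, sample)
assert half['net_input']['src_tokens'].dtype is torch.float16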
def main(model_path, input):
    LOGGER.info('Load model from {}'.format(model_path))
    state = load_checkpoint_to_cpu(model_path, arg_overrides={})
    args = state["args"]
    args = recursive_contractuser(args, old_cache_name='.ncc')
    args = recursive_expanduser(args)
    task = tasks.setup_task(args)  # load src/tgt dicts
    model = task.build_model(args)
    model.load_state_dict(state["model"])
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.set_device(torch.cuda.device_count() - 1)
        model.cuda()
    model.eval()
    if args['common']['fp16'] and use_cuda:
        model.half()

    sample = task.encode_input(input)
    sample = utils.move_to_cuda(sample) if use_cuda else sample
    generator = task.sequence_completor
    net_output = generator.complete(models=[model], sample=sample)
    out = task.decode_output(net_output)
    return out
def main(args, out_file=None):
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _ = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args['eval']['no_beamable_mm'] else args['eval']['beam'],
            need_attn=args['eval']['print_alignment'],
        )
        if use_cuda:
            # use the first visible device by default
            device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
            torch.cuda.set_device(f'cuda:{device}')
            model = model.cuda()
        if args['common']['fp16'] and use_cuda:
            model.half()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=args['dataset']['required_batch_size_multiple'],
        num_shards=args['dataset']['num_shards'],
        shard_id=args['dataset']['shard_id'],
        num_workers=args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args['common']['log_format'],
        log_interval=args['common']['log_interval'],
        default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    sources, hypotheses, references = dict(), dict(), dict()
    for sample in progress:
        torch.cuda.empty_cache()
        sample = move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, bos_token=tgt_dict.bos())
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)  # TODO: warning
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()
            hypos_tokens = utils.strip_eos(hypos[i][0]['tokens'], tgt_dict.eos()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args['eval']['remove_bpe'])
            else:
                src_str = "0"
            if has_target:
                target_str = tgt_dict.string(target_tokens, args['eval']['remove_bpe'], escape_unk=True)
            hypo_str = tgt_dict.string(hypos_tokens, args['eval']['remove_bpe'])

            sources[sample_id] = [src_str]
            hypotheses[sample_id] = [hypo_str]
            references[sample_id] = [target_str]

    bleu, rouge_l, meteor = summarization_metrics.eval_accuracies(
        hypotheses, references, filename=out_file, mode='test')
    LOGGER.info('BLEU: {:.2f}\t ROUGE-L: {:.2f}\t METEOR: {:.2f}'.format(bleu, rouge_l, meteor))
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000
    LOGGER.info(args)

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args['eval']['no_beamable_mm'] else args['eval']['beam'],
            need_attn=args['eval']['print_alignment'],
        )
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args['eval']['replace_unk'])

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    scorer = OrderedDict()
    if args['eval']['sacrebleu']:
        scorer['bleu'] = bleu_scorer.SacrebleuScorer()
    elif args['eval']['nltk_bleu']:
        scorer['bleu'] = bleu_scorer.NLTKBleuScorer()
    else:
        scorer['bleu'] = bleu_scorer.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    # Generate and compute ROUGE score
    if args['eval']['rouge']:
        scorer['rouge'] = rouge_scorer.RougeScorer()

    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    # for sample in tqdm(progress, total=len(progress)):
    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args['eval']['prefix_size'] > 0:
            prefix_tokens = sample['target'][:, :args['eval']['prefix_size']]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args['dataset']['gen_subset']).src.get_original_text(sample_id)
                target_str = task.dataset(args['dataset']['gen_subset']).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args['eval']['remove_bpe'])
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args['eval']['remove_bpe'], escape_unk=True)

            if not args['eval']['quiet']:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str), file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str), file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args['eval']['nbest']]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args['eval']['remove_bpe'],
                )
                if hypo_str == '.':  # rouge cannot handle the hypothesis '.'
                    continue
                if not args['eval']['quiet']:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str), file=output_file)
                    print(
                        'P-{}\t{}'.format(
                            sample_id,
                            ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    # convert from base e to base 2
                                    hypo['positional_scores'].div_(math.log(2)).tolist(),
                                ))),
                        file=output_file)
                    if args['eval']['print_alignment']:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join([
                                '{}-{}'.format(src_idx, tgt_idx)
                                for src_idx, tgt_idx in alignment
                            ])),
                            file=output_file)
                    if args['eval']['print_step']:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']), file=output_file)
                    # if getattr(args, 'retain_iter_history', False):
                    if args['eval']['retain_iter_history']:
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str), file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    # print('Ref>> {}'.format(target_str), file=output_file)
                    # print('Hyp>> {}'.format(hypo_str), file=output_file)
                    if align_dict is not None or args['eval']['remove_bpe'] is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                    for metric in scorer:
                        if hasattr(scorer[metric], 'add_string'):
                            scorer[metric].add_string(target_str, hypo_str)
                        else:
                            scorer[metric].add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    LOGGER.info('NOTE: hypothesis and token scores are output in base 2')
    LOGGER.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        LOGGER.info('Generate {} with beam={}: {}'.format(
            args['dataset']['gen_subset'], args['eval']['beam'], {
                '\n{}:\n{}'.format(str.upper(metric), value.score())
                for metric, value in scorer.items()
            }))

    return scorer
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000
    LOGGER.info(args)

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['dataset']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    # usage: nohup python -m run.completion.seqrnn.eval > run/completion/seqrnn/case.log 2>&1 &
    sequence_completor = task.build_completor([model], args)

    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        non_pad_idx = sample['net_input']['src_tokens'] > task.target_dictionary.pad()
        with torch.no_grad():
            net_output = sequence_completor.generate([model], sample, prefix_tokens=None)
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        rank = torch.argmax(lprobs, dim=-1)
        target = model.get_targets(sample, net_output)
        accuracy = 1.0 * ((rank == target) & non_pad_idx).sum(dim=-1) / non_pad_idx.sum(dim=-1)
        for idx, (data_idx, acc) in enumerate(zip(sample['id'], accuracy)):
            if acc > 0.9:
                LOGGER.info(
                    f"{data_idx}: {task.target_dictionary.string(sample['net_input']['src_tokens'][idx, :])}")
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000
    LOGGER.info(args)

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'])

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args['eval']['no_beamable_mm'] else args['eval']['beam'],
            need_attn=args['eval']['print_alignment'],
        )
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args['eval']['replace_unk'])

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    # for sample in tqdm(progress, total=len(progress)):
    sources, hypotheses, references = dict(), dict(), dict()
    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        # prefix_tokens = None
        # if args['eval']['prefix_size'] > 0:
        #     prefix_tokens = sample['target'][:, :args['eval']['prefix_size']]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample)
        # gen_out = task.sequence_generator.generate(model, sample)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)  # TODO: warning
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()
            hypos_tokens = utils.strip_eos(hypos[i][0]['tokens'], tgt_dict.eos()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            # if align_dict is not None:
            #     src_str = task.dataset(args['dataset']['gen_subset']).src.get_original_text(sample_id)
            #     target_str = task.dataset(args['dataset']['gen_subset']).tgt.get_original_text(sample_id)
            # else:
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args['eval']['remove_bpe'])
            else:
                src_str = ""
            if has_target:
                target_str = tgt_dict.string(target_tokens, args['eval']['remove_bpe'], escape_unk=True)
            # hypo_tokens = tgt_dict.encode_line(hypo_str, add_if_not_exist=True)
            hypo_str = tgt_dict.string(hypos_tokens, args['eval']['remove_bpe'])

            sources[sample_id] = [src_str]
            hypotheses[sample_id] = [hypo_str]
            references[sample_id] = [target_str]

            if not args['eval']['quiet']:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str), file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str), file=output_file)
                print('H-{}\t{}'.format(sample_id, hypo_str), file=output_file)

    filename = os.path.join(os.path.dirname(__file__), 'config', 'predict.json')
    LOGGER.info('write predicted file at {}'.format(filename))
    bleu, rouge_l, meteor = eval_utils.eval_accuracies(
        hypotheses, references, filename=filename, mode='test')
    LOGGER.info('BLEU: {:.2f}\t ROUGE-L: {:.2f}\t METEOR: {:.2f}'.format(bleu, rouge_l, meteor))
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # use the first visible device by default
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    # Load dataset splits
    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    sequence_completor = task.build_completor(models, args)

    subsets = [
        args['dataset']['train_subset'],
        args['dataset']['valid_subset'],
        args['dataset']['gen_subset'],
    ]
    for subset in subsets:
        task.load_dataset(subset, shuffle=False)
        task.dataset(subset).shuffle = False

        # Load dataset (possibly sharded)
        itr = task.get_batch_iterator(
            dataset=task.dataset(subset),
            max_tokens=args['dataset']['max_tokens'],
            max_sentences=args['eval']['max_sentences_eval'],
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                *[model.max_positions() for model in models]),
            ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
            required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
            num_shards=_model_args['dataset']['num_shards'],
            shard_id=_model_args['dataset']['shard_id'],
            num_workers=_model_args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=_model_args['common']['log_format'],
            log_interval=_model_args['common']['log_interval'],
            default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
        )

        topk = args['kd']['gen_topk']
        out_idx, out_prob = [], []
        with torch.no_grad():
            for sample in progress:
                torch.cuda.empty_cache()
                sample = move_to_cuda(sample) if use_cuda else sample
                if 'net_input' not in sample:
                    continue
                net_output = sequence_completor.generate([model], sample, prefix_tokens=None)
                topk_prob, topk_ids = torch.topk(net_output[0], topk, dim=-1)
                # ignore pad
                non_padding_mask = sample['net_input']['src_tokens'] != task.target_dictionary.pad()
                if use_cuda:
                    topk_prob, topk_ids = topk_prob.cpu(), topk_ids.cpu()
                    non_padding_mask = non_padding_mask.cpu()
                for idx in range(topk_prob.size(0)):
                    out_idx.append(topk_ids[idx, ...][non_padding_mask[idx, ...]].view(-1).tolist())
                    out_prob.append(topk_prob[idx, ...][non_padding_mask[idx, ...]].view(-1).tolist())

        assert len(out_idx) == len(out_prob) == len(task.dataset(subset)), \
            Exception(len(out_idx), len(out_prob), len(task.dataset(subset)))

        TeacherOutDataset.save_bin(
            prefix=os.path.join(args['checkpoint']['save_dir'], f'{subset}.top{topk}_idx'),
            data_list=out_idx,
            dtype=np.int32,
        )
        TeacherOutDataset.save_bin(
            prefix=os.path.join(args['checkpoint']['save_dir'], f'{subset}.top{topk}_prob'),
            data_list=out_prob,
            dtype=np.float64,  # np.float was removed from NumPy; float64 matches the old alias
        )
def main(args, **unused_kwargs):
    assert args['eval']['path'] is not None, '--path required for evaluation!'

    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])

    LOGGER.info(args)
    # while evaluating, set fraction_using_func_name = 0, i.e., do not sample from func_name
    args['task']['fraction_using_func_name'] = 0.

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # use the first visible device by default
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    for lang in deepcopy(args['dataset']['langs']):
        args['dataset']['langs'] = [lang]
        # Load dataset splits
        LOGGER.info(f'Evaluating {lang} dataset')
        task.load_dataset(args['dataset']['gen_subset'])
        dataset = task.dataset(args['dataset']['gen_subset'])

        # Optimize ensemble for generation and set the source and dest dicts
        # on the model (required by scorer)
        for model in models:
            model.make_generation_fast_()
            if args['common']['fp16']:
                model.half()
            if use_cuda:
                model.cuda()

        assert len(models) > 0
        LOGGER.info('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=args['dataset']['max_tokens'] or 36000,
            max_sentences=args['eval']['max_sentences'],
            max_positions=utils.resolve_max_positions(
                *[model.max_positions() for model in models]),
            ignore_invalid_inputs=True,
            num_shards=args['dataset']['num_shards'],
            shard_id=args['dataset']['shard_id'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
        )

        code_reprs, query_reprs = [], []
        for sample in progress:
            if 'net_input' not in sample:
                continue
            sample = move_to_cuda(sample) if use_cuda else sample
            batch_code_reprs, batch_query_reprs = models[0](**sample['net_input'])
            if use_cuda:
                batch_code_reprs = batch_code_reprs.cpu().detach()
                batch_query_reprs = batch_query_reprs.cpu().detach()
            code_reprs.append(batch_code_reprs)
            query_reprs.append(batch_query_reprs)
        code_reprs = torch.cat(code_reprs, dim=0)
        query_reprs = torch.cat(query_reprs, dim=0)

        assert code_reprs.shape == query_reprs.shape, (code_reprs.shape, query_reprs.shape)
        eval_size = len(code_reprs) if args['eval']['eval_size'] == -1 else args['eval']['eval_size']

        k, MRR, topk_idx, topk_prob = 3, [], [], []
        for idx in range(len(dataset) // eval_size):
            # step in chunks of eval_size
            code_emb = code_reprs[idx * eval_size:(idx + 1) * eval_size, :]
            query_emb = query_reprs[idx * eval_size:(idx + 1) * eval_size, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'search_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'search_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            mrr = 1 / compared_scores.sum(dim=-1).float()
            MRR.extend(mrr.tolist())

        if len(dataset) % eval_size:
            code_emb = code_reprs[-eval_size:, :]
            query_emb = query_reprs[-eval_size:, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'search_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'search_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            last_ids = len(code_reprs) % eval_size
            mrr = 1 / compared_scores.sum(dim=-1).float()[-last_ids:]
            MRR.extend(mrr.tolist())

        print('{}, mrr: {:.4f}'.format(lang, np.mean(MRR)))
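# NOTE: the MRR computation above relies on a broadcast trick: `logits.diag()`
# holds each query's score against its own code snippet, and counting how many
# scores are at least as large yields the 1-based rank. A self-contained toy
# demonstration (made-up numbers, not the project's data):
import torch

logits = torch.tensor([
    [0.9, 0.1, 0.3],
    [0.2, 0.4, 0.8],
    [0.1, 0.2, 0.7],
])  # row i = query i scored against all code snippets
correct_scores = logits.diag()                            # score of each true pair
ranks = (logits >= correct_scores.unsqueeze(-1)).sum(-1)  # 1-based rank of the true pair
mrr = (1.0 / ranks.float()).mean()
print(ranks.tolist(), mrr.item())  # [1, 2, 1] -> MRR = (1 + 0.5 + 1) / 3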
def cli_main():
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # use the first visible device by default
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')
    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset, truncation_length=MAX_SOURCE_POSITIONS, truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')
        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # build toy dataset for 10-fold cross validation
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # deeptune model
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64, rnn_dropout=0.,
                                    rnn_num_layers=2, aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate)
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # test accuracy
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate)
            predictions, ground_truth = [], []
            for sample in test_dataloader:
                if use_cuda:
                    sample = move_to_cuda(sample)
                hybrid_out, _ = model(**sample['net_input'])
                predictions.append(hybrid_out.max(dim=-1)[1])
                ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)
            accuracy = (predictions == ground_truth).tolist()

            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                    zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer

    performance = pd.DataFrame(
        data, index=range(1, len(data) + 1), columns=[
            "Model", "Platform", "Benchmark", "Benchmark Suite",
            "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup"
        ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
def _main(args, output_file):
    if args['dataset']['max_tokens'] is None and args['dataset']['max_sentences'] is None:
        args['dataset']['max_tokens'] = 12000

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        # use the first visible device by default
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]
        torch.cuda.set_device(f'cuda:{device}')

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args['dataset']['gen_subset'], shuffle=False)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        if _model_args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args['dataset']['gen_subset']),
        max_tokens=args['dataset']['max_tokens'],
        max_sentences=args['eval']['max_sentences_eval'],
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=_model_args['dataset']['skip_invalid_size_inputs_valid_test'],
        required_batch_size_multiple=_model_args['dataset']['required_batch_size_multiple'],
        num_shards=_model_args['dataset']['num_shards'],
        shard_id=_model_args['dataset']['shard_id'],
        num_workers=_model_args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=_model_args['common']['log_format'],
        log_interval=_model_args['common']['log_interval'],
        default_log_format=('tqdm' if not _model_args['common']['no_progress_bar'] else 'none'),
    )

    sequence_completor = task.build_completor([model], args)

    accuracy = {'all': 0.}
    mrr = {'all': 0.}
    sample_num = {'all': 0.}
    if task.dataset('test').attrs is not None:
        for attr in task.dataset('test').attrs:
            accuracy[attr] = 0.
            mrr[attr] = 0.
            sample_num[attr] = 0

    def _eval(lprobs, target, idx, num):
        with torch.no_grad():
            lprobs = lprobs[idx]
            target = target[idx]
            accuracy = (torch.argmax(lprobs, dim=-1) == target).sum().float().item()
            # Ref: Code Prediction by Feeding Trees to Transformers
            # "With this practical perspective and for ease of computation, we only
            #  consider rank_i <= 10 for each location i (all rank_i > 10 will have
            #  a score of 0)."
            ranks = (lprobs >= lprobs[:, target].diag().unsqueeze(dim=-1)).sum(-1)
            mrr = 1. / ranks
            mrr[ranks > 10] = 0.
            mrr = mrr.sum().float().item()
        return accuracy, mrr, num

    for sample in progress:
        torch.cuda.empty_cache()
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue
        with torch.no_grad():
            net_output = sequence_completor.generate([model], sample, prefix_tokens=None)

        # lprobs = model.get_normalized_probs(net_output, log_probs=True)
        lprobs = torch.softmax(net_output[0], dim=-1)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = model.get_targets(sample, net_output).view(-1)

        # all tokens: ignore pad and unk
        idx = sample['net_input']['src_tokens'].view(-1) != task.target_dictionary.pad()
        idx[sample['target'].view(-1) == task.target_dictionary.unk()] = 0
        # ignore overlapping tokens
        max_len = sample['target'].size(-1)
        for i, ext_i in enumerate(sample['extends']):
            idx[i * max_len:i * max_len + ext_i] = 0

        batch_acc, batch_mrr, batch_num = _eval(lprobs, target, idx, num=idx.sum().item())
        accuracy['all'] += batch_acc
        mrr['all'] += batch_mrr
        sample_num['all'] += batch_num

        # other attrs
        if sample['attr_masks'] is not None:
            for attr, attr_idx in sample['attr_masks'].items():
                # pick out attr_idx entries that are not unk/pad
                attr_idx = attr_idx[idx[attr_idx].tolist()]
                if len(attr_idx) > 0:
                    batch_acc, batch_mrr, batch_num = _eval(lprobs, target, attr_idx, num=attr_idx.size)
                    accuracy[attr] += batch_acc
                    mrr[attr] += batch_mrr
                    sample_num[attr] += batch_num

    for attr in accuracy.keys():
        avg_acc = round(accuracy[attr] / sample_num[attr], 6) if sample_num[attr] > 0. else None
        avg_mrr = round(mrr[attr] / sample_num[attr], 6) if sample_num[attr] > 0. else None
        print('[{}] tokens, accuracy: {}, MRR: {}'.format(attr, avg_acc, avg_mrr))
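# NOTE: `_eval` above applies the same rank-by-comparison trick per token, zeroing
# reciprocal ranks above 10 as in the cited paper ("Code Prediction by Feeding
# Trees to Transformers"). A toy sketch; `lprobs[:, target].diag()` in the original
# is equivalent to the arange-based gather used here:
import torch

lprobs = torch.tensor([
    [0.05, 0.70, 0.10, 0.15],   # gold token 1 -> rank 1
    [0.40, 0.30, 0.20, 0.10],   # gold token 2 -> rank 3
])
target = torch.tensor([1, 2])
gold = lprobs[torch.arange(len(target)), target]   # probability of the gold token
ranks = (lprobs >= gold.unsqueeze(-1)).sum(-1)     # 1-based rank of the gold token
mrr = 1.0 / ranks.float()
mrr[ranks > 10] = 0.                               # only ranks <= 10 score
print(ranks.tolist(), mrr.tolist())                # [1, 3] -> [1.0, 0.3333...]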
def main(args, **unused_kwargs):
    assert args['eval']['path'] is not None, '--path required for evaluation!'

    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])

    LOGGER.info(args)
    # while evaluating, set fraction_using_func_name = 0, i.e., do not sample from func_name
    args['task']['fraction_using_func_name'] = 0.

    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args['dataset']['gen_subset'])
    dataset = task.dataset(args['dataset']['gen_subset'])

    # Optimize ensemble for generation and set the source and dest dicts
    # on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args['common']['fp16']:
            model.half()
        if use_cuda:
            model.cuda()

    assert len(models) > 0
    LOGGER.info('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args['dataset']['max_tokens'] or 36000,
        max_sentences=args['eval']['max_sentences'],
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=True,
        num_shards=args['dataset']['num_shards'],
        shard_id=args['dataset']['shard_id'],
        num_workers=args['dataset']['num_workers'],
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args['common']['log_format'],
        log_interval=args['common']['log_interval'],
        default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
    )

    code_reprs, query_reprs = [], []
    for sample in progress:
        if 'net_input' not in sample:
            continue
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        batch_code_reprs, batch_query_reprs = models[0](**sample['net_input'])
        code_reprs.extend(batch_code_reprs.tolist())
        query_reprs.extend(batch_query_reprs.tolist())
    code_reprs = np.asarray(code_reprs, dtype=np.float32)
    query_reprs = np.asarray(query_reprs, dtype=np.float32)

    assert code_reprs.shape == query_reprs.shape, (code_reprs.shape, query_reprs.shape)
    eval_size = len(code_reprs) if args['eval']['eval_size'] == -1 else args['eval']['eval_size']

    k, MRR, topk_idx, topk_prob = 3, [], [], []
    for idx in range(len(dataset) // eval_size):
        # step in chunks of eval_size (matches the `idx * eval_size` offset below)
        code_emb = torch.from_numpy(code_reprs[idx * eval_size:(idx + 1) * eval_size, :]).cuda()
        query_emb = torch.from_numpy(query_reprs[idx * eval_size:(idx + 1) * eval_size, :]).cuda()

        logits = query_emb @ code_emb.t()
        # src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
        # tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
        # logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()

        correct_scores = logits.diag()
        compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
        mrr = 1 / compared_scores.sum(dim=-1).float()
        MRR.extend(mrr.tolist())

        batch_topk_prob, batch_topk_idx = logits.softmax(dim=-1).topk(k)
        batch_topk_idx = batch_topk_idx + idx * eval_size
        topk_idx.extend(batch_topk_idx.tolist())
        topk_prob.extend(batch_topk_prob.tolist())

    if len(dataset) % eval_size:
        code_emb = torch.from_numpy(code_reprs[-eval_size:, :]).cuda()
        query_emb = torch.from_numpy(query_reprs[-eval_size:, :]).cuda()

        logits = query_emb @ code_emb.t()
        # src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
        # tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
        # logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()

        correct_scores = logits.diag()
        compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
        last_ids = len(code_reprs) % eval_size
        mrr = 1 / compared_scores.sum(dim=-1).float()[-last_ids:]
        MRR.extend(mrr.tolist())

        batch_topk_prob, batch_topk_idx = logits[-last_ids:].softmax(dim=-1).topk(k)
        batch_topk_idx = batch_topk_idx + len(code_reprs) - eval_size
        topk_idx.extend(batch_topk_idx.tolist())
        topk_prob.extend(batch_topk_prob.tolist())

    print('mrr: {:.4f}'.format(np.mean(MRR)))
    for idx, mrr in enumerate(MRR):
        if mrr == 1.0 and topk_prob[idx][0] > 0.8:
            print(np.asarray(topk_idx[idx]) + 1,
                  [round(prob, 4) for prob in topk_prob[idx]])
def hybrid_retrieval_task(args, task, model, use_cuda, input, **kwargs):
    task.args['dataset']['langs'] = kwargs['lang']
    topk = kwargs['topk']

    # load code_tokens dataset
    task.load_dataset(split=args['dataset']['gen_subset'])
    code_dataset = task.dataset(args['dataset']['gen_subset'])

    # construct similarities
    similarities = torch.FloatTensor(len(code_dataset)).fill_(0.0)

    def cosine_fn(code_emb, query_emb):
        src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
        tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
        similarity = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
        return similarity

    def softmax_fn(code_emb, query_emb):
        similarity = query_emb @ code_emb.t()
        return similarity

    if args['criterion'] == 'retrieval_cosine':
        similarity_metrics = cosine_fn
    elif args['criterion'] == 'retrieval_softmax':
        similarity_metrics = softmax_fn
    else:
        raise NotImplementedError(args['criterion'])

    # query embedding
    query_tokens = task.encode_query_input(input).unsqueeze(dim=0)
    if use_cuda:
        query_tokens = utils.move_to_cuda(query_tokens)
    query_tokens = model.tgt_encoders(query_tokens)

    # code embedding
    code_encoder = model.src_encoders[task.args['dataset']['langs'][0]]
    for idx, code_tokens in enumerate(code_dataset.src):
        code_tokens = code_tokens.unsqueeze(dim=0)
        if use_cuda:
            code_tokens = utils.move_to_cuda(code_tokens)
        code_tokens = code_encoder(code_tokens)
        similarities[idx] = similarity_metrics(code_tokens, query_tokens).item()

    topk_probs, topk_ids = similarities.topk(k=topk)
    topk_ids_probs = {
        idx.item(): round(prob.item() * 100, 4)
        for prob, idx in zip(topk_probs, topk_ids)
    }
    topk_ids = set(topk_ids.tolist())

    if 'code_file' in args['eval']:
        code_raw_file = args['eval']['code_file']
    else:
        default_dir = args['task']['data'][:args['task']['data'].rfind('retrieval')]
        code_raw_file = os.path.join(default_dir, "attributes",
                                     task.args['dataset']['langs'][0], "test.code")

    out = []
    with open(code_raw_file, 'r') as reader:
        for idx, line in enumerate(reader):
            if idx in topk_ids:
                out.append([line, topk_ids_probs[idx]])
                if len(out) == len(topk_ids):
                    break
    out = sorted(out, key=lambda code_prob: code_prob[-1], reverse=True)
    return out