def main():
    parser = argparse.ArgumentParser(
        description='Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--answers', '-a', required=True,
                        help="filename of the labels, in txt format.")
    parser.add_argument('--predictions', '-p', required=True,
                        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predictions, "r").readlines()
    gts = open(args.answers, "r").readlines()

    assert len(preds) == len(gts), \
        f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    edit_sim = 0.0
    for pred, gt in zip(preds, gts):
        pred = post_process(pred.strip())
        gt = post_process(gt.strip())
        edit_sim += fuzz.ratio(pred, gt)

    bleu_score = round(_bleu(args.answers, args.predictions), 2)
    logger.info(f"Edit sim: {round(edit_sim/total, 2)}, BLEU: {bleu_score}")
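# `_bleu` is referenced throughout these scripts but not defined in this
# section. In the CodeXGLUE codebase it scores two text files (one example per
# line) with smoothed corpus-level BLEU-4. Below is a minimal stand-in sketch
# using NLTK, assuming whitespace tokenization; it is not the exact reference
# implementation, and `_bleu_sketch` is a hypothetical name.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def _bleu_sketch(ref_file, trans_file):
    references, hypotheses = [], []
    with open(ref_file, encoding="utf-8") as fr, open(trans_file, encoding="utf-8") as fh:
        for ref, hyp in zip(fr, fh):
            references.append([ref.strip().split()])  # one reference per example
            hypotheses.append(hyp.strip().split())
    # Smoothing (method4) avoids zero scores on short hypotheses.
    return corpus_bleu(references, hypotheses,
                       smoothing_function=SmoothingFunction().method4) * 100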
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--expected', '-a', required=True,
                        help="filename of the labels, in txt format.")
    parser.add_argument('--predicted', '-p', required=True,
                        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predicted, "r").readlines()
    gts = open(args.expected, "r").readlines()

    assert len(preds) == len(gts), \
        f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    EM = 0.0
    for pred, gt in zip(preds, gts):
        # Normalize whitespace before the exact-match comparison.
        pred = ' '.join(pred.strip().split())
        gt = ' '.join(gt.strip().split())
        if pred == gt:
            EM += 1

    bleu_score = round(_bleu(args.expected, args.predicted), 2)
    print(f"BLEU: {bleu_score}, EM: {round(EM / total * 100, 2)}")
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Evaluate leaderboard predictions for BigCloneBench dataset.')
    parser.add_argument('--references', '-ref',
                        help="filename of the labels, in txt format.")
    parser.add_argument('--predictions', '-pre',
                        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    refs = [x.strip() for x in open(args.references, 'r', encoding='utf-8').readlines()]
    pres = [x.strip() for x in open(args.predictions, 'r', encoding='utf-8').readlines()]
    assert len(refs) == len(pres)

    length = len(refs)
    count = 0
    for i in range(length):
        r = refs[i]
        p = pres[i]
        if r == p:
            count += 1
    acc = round(count / length * 100, 2)

    bleu_score = round(_bleu(args.references, args.predictions), 2)
    print('BLEU:', bleu_score, '; Acc:', acc)
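# Example usage for the evaluator entry points above (file names below are
# placeholders, not paths from this repository):
#
#   python evaluator.py --references references.txt --predictions predictions.txt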
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--answers', '-a', required=True,
                        help="filename of the labels, in json format.")
    parser.add_argument('--predictions', '-p', required=True,
                        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predictions, "r").readlines()
    gts = open(args.answers, "r").readlines()

    assert len(preds) == len(gts), \
        f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    EM = 0.0
    wf = open("ground_truth.txt", "w")
    for pred, gt in zip(preds, gts):
        pred = pred.strip()
        gt = json.loads(gt)["code"]
        wf.write(gt + "\n")
        if pred.split() == gt.split():
            EM += 1
    # Flush the extracted gold file before _bleu reads it back from disk.
    wf.close()

    bleu_score = round(_bleu("ground_truth.txt", args.predictions), 2)
    logger.info(f"BLEU: {bleu_score}, EM: {round(EM/total*100, 2)}")

    try:
        os.remove("ground_truth.txt")
    except Exception:
        pass
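# Each line of the --answers file above is expected to be a JSON object with a
# "code" field, so json.loads(gt)["code"] recovers the gold snippet. An
# illustrative, made-up sample line:
import json
sample_gt = '{"id": 0, "code": "public int add ( int a , int b ) { return a + b ; }"}'
print(json.loads(sample_gt)["code"])  # -> public int add ( int a , int b ) { return a + b ; }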
def cal_bleu(hyp, ref):
    dev_bleu = round(_bleu(ref, hyp), 2)
    f1 = codecs.open(ref, "r", "utf-8")
    f2 = codecs.open(hyp, "r", "utf-8")
    accs = []
    for l1, l2 in zip(f1.readlines(), f2.readlines()):
        accs.append(l1.strip() == l2.strip())
    print("bleu-4: ", str(dev_bleu))
    # Also report line-level exact match alongside BLEU.
    print("xMatch: ", str(round(sum(accs) / len(accs) * 100, 4)))
def calculate_scores(references, predictions, topk):
    # Note: unlike the file-based `_bleu` calls elsewhere, this variant is
    # handed in-memory token lists; `predictions` is assumed to be an array of
    # shape (num_examples, k) holding the top-k candidates per example.
    length = len(references)
    count = 0
    for i in range(length):
        r = references[i]
        p = predictions[i]
        for j in range(topk):
            if p[j] == r:
                count += 1
                break
    acc = count / length * 100
    bleu_score = _bleu(references, predictions[:, :topk].tolist())
    return acc, bleu_score
def eval_bleu(args, model, tokenizer, file_type='test', num=99999999):
    dataset = CodeChangeDataset(tokenizer, args, logger, file_type=file_type,
                                block_size=args.block_size, mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    for step, (batch, token_labels) in enumerate(
            tqdm(test_dataloader, total=min(num, len(dataset)))):
        if step >= num:
            break
        inputs = batch.to(args.device)
        with torch.no_grad():
            beam_size = args.beam_size
            m = torch.nn.LogSoftmax(dim=-1)
            outputs = model(inputs)[1]
            p = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(inputs.shape[0]):
                # Expand each layer's cached hidden states across the beam.
                past_hidden = []
                for x in outputs:
                    _p = x[:, i:i + 1]
                    _q = _p.expand(-1, beam_size, -1, -1, -1)
                    past_hidden.append(_q)
                beam = Beam(beam_size, tokenizer.bos_token_id, tokenizer.eos_token_id)
                input_ids = None
                for _ in range(162):
                    if beam.done():
                        break
                    input_ids = beam.getCurrentState()
                    transformer_outputs = model(input_ids, past=past_hidden)
                    out = m(transformer_outputs[0][:, -1, :]).data
                    beam.advance(out)
                    past_hidden = [
                        x.data.index_select(1, beam.getCurrentOrigin())
                        for x in transformer_outputs[1]
                    ]
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:beam_size]
                # Pad every hypothesis with zeros up to the fixed length of 162.
                pred = [
                    torch.cat([x.view(-1) for x in p] + [zero] * (162 - len(p))).view(1, -1)
                    for p in pred
                ]
                p.append(torch.cat(pred, 0).unsqueeze(0))
            p = torch.cat(p, 0)
            for pred in p:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                preds.append(text)

    golds = []
    datas = read_data(data_dir=args.data_dir, file_type=file_type)
    for (src, tgt) in datas[:num]:
        golds.append(tgt)

    assert len(preds) == len(golds), 'Pred %d\tGold %d' % (len(preds), len(golds))

    EM = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"), 'w',
              encoding='utf-8') as f, \
         open(os.path.join(args.output_dir, f"{file_type}.gold"), 'w',
              encoding='utf-8') as f1:
        for pred, gold in zip(preds, golds):
            f.write(pred + '\n')
            f1.write(gold + '\n')
            EM.append(pred.split() == gold.split())

    bleu_score = round(
        _bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
              os.path.join(args.output_dir, f"{file_type}.output")), 2)
    EM = round(np.mean(EM) * 100, 2)
    return bleu_score, EM
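# The `Beam` helper used by eval_bleu above (and by the variants further down)
# is not defined in this section. Based on how it is called, its interface is
# roughly the OpenNMT-style one sketched below; this is a documented
# assumption, not the actual implementation:
#
#   beam = Beam(size, sos_id, eos_id)
#   beam.done()               # True once the best hypothesis has emitted EOS
#   beam.getCurrentState()    # (size, 1) LongTensor of last-step tokens
#   beam.advance(log_probs)   # consume one (size, vocab) step of log-probs
#   beam.getCurrentOrigin()   # backpointers used to reorder the cached states
#   beam.getFinal()           # finished hypotheses as (score, step, beam_idx)
#   beam.getHyp(finals)       # token-id sequences for those hypotheses
#   beam.buildTargetTokens(h) # sequences truncated at the first EOS token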
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type: e.g. roberta")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument("--tokenizer_name", default="", required=True,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model: Should contain the .bin files")

    ## Other parameters
    parser.add_argument("--train_filename", default=None, type=str,
                        help="The train filenames (source and target files).")
    parser.add_argument("--dev_filename", default=None, type=str,
                        help="The dev filename (source and target files).")
    parser.add_argument("--test_filename", default=None, type=str,
                        help="The test filename (source and target files).")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--max_source_length", default=64, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=32, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int,
                        help="Run evaluation every this many optimizer steps.")
    parser.add_argument("--train_steps", default=-1, type=int,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    # print arguments
    args = parser.parse_args()
    logger.info(args)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    args.device = device

    # Set seed
    set_seed(args)

    # make dir if output_dir not exist
    if os.path.exists(args.output_dir) is False:
        os.makedirs(args.output_dir)

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name,
                                                do_lower_case=args.do_lower_case)

    # build model
    encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size,
                                               nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path))

    model.to(device)
    if args.local_rank != -1:
        # Distributed training
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif args.n_gpu > 1:
        # multi-gpu training
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
        all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
        all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long)
        all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
        all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size // args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps)

        # Start training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num epoch = %d",
                    num_train_optimization_steps * args.train_batch_size // len(train_examples))

        model.train()
        dev_dataset = {}
        nb_tr_examples, nb_tr_steps, tr_loss, global_step, best_bleu, best_loss = 0, 0, 0, 0, 0, 1e6
        bar = range(num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)
        eval_flag = True
        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            source_ids, source_mask, target_ids, target_mask = batch
            loss, _, _ = model(source_ids=source_ids, source_mask=source_mask,
                               target_ids=target_ids, target_mask=target_mask)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
            if (global_step + 1) % 100 == 0:
                logger.info("  step {} loss {}".format(global_step + 1, train_loss))
            nb_tr_examples += source_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # Update parameters
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                eval_flag = True

            if args.do_eval and ((global_step + 1) % args.eval_steps == 0) and eval_flag:
                # Eval model with dev dataset
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                eval_flag = False
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
                    all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
                    all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                logger.info("\n***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                # Start Evaling model
                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, target_ids, target_mask = batch
                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids, source_mask=source_mask,
                                             target_ids=target_ids, target_mask=target_mask)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()

                # Print loss of dev dataset
                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss), 5),
                          'global_step': global_step + 1,
                          'train_loss': round(train_loss, 5)}
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                logger.info("  " + "*" * 20)

                # save last checkpoint
                last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
                if not os.path.exists(last_output_dir):
                    os.makedirs(last_output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(last_output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)

                if eval_loss < best_loss:
                    logger.info("  Best ppl:%s", round(np.exp(eval_loss), 5))
                    logger.info("  " + "*" * 20)
                    best_loss = eval_loss
                    # Save best checkpoint for best ppl
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-ppl')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

                # Calculate bleu
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_examples = random.sample(eval_examples, min(1000, len(eval_examples)))
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                model.eval()
                p = []
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask = batch
                    with torch.no_grad():
                        preds = model(source_ids=source_ids, source_mask=source_mask)
                        for pred in preds:
                            t = pred[0].cpu().numpy()
                            t = list(t)
                            if 0 in t:
                                t = t[:t.index(0)]
                            text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                            p.append(text)
                model.train()

                predictions = []
                accs = []
                with open(os.path.join(args.output_dir, "dev.output"), 'w') as f, \
                     open(os.path.join(args.output_dir, "dev.gold"), 'w') as f1:
                    for ref, gold in zip(p, eval_examples):
                        predictions.append(str(gold.idx) + '\t' + ref)
                        f.write(ref + '\n')
                        f1.write(gold.target + '\n')
                        accs.append(ref == gold.target)
                dev_bleu = round(_bleu(os.path.join(args.output_dir, "dev.gold"),
                                       os.path.join(args.output_dir, "dev.output")), 2)
                logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
                logger.info("  %s = %s " % ("xMatch", str(round(np.mean(accs) * 100, 4))))
                logger.info("  " + "*" * 20)
                if dev_bleu > best_bleu:
                    logger.info("  Best bleu:%s", dev_bleu)
                    logger.info("  " + "*" * 20)
                    best_bleu = dev_bleu
                    # Save best checkpoint for best bleu
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_test:
        logger.info("Running Test")
        files = []
        if args.dev_filename is not None:
            files.append(args.dev_filename)
        if args.test_filename is not None:
            files.append(args.test_filename)
        for idx, file in enumerate(files):
            logger.info("Test file: {}".format(file))
            eval_examples = read_examples(file)
            eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
            all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
            all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_source_ids, all_source_mask)

            # Calculate bleu
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            p = []
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask = batch
                with torch.no_grad():
                    preds = model(source_ids=source_ids, source_mask=source_mask)
                    for pred in preds:
                        t = pred[0].cpu().numpy()
                        t = list(t)
                        if 0 in t:
                            t = t[:t.index(0)]
                        text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                        p.append(text)
            model.train()

            predictions = []
            accs = []
            with open(os.path.join(args.output_dir, "test_{}.output".format(str(idx))), 'w') as f, \
                 open(os.path.join(args.output_dir, "test_{}.gold".format(str(idx))), 'w') as f1:
                for ref, gold in zip(p, eval_examples):
                    predictions.append(str(gold.idx) + '\t' + ref)
                    f.write(ref + '\n')
                    f1.write(gold.target + '\n')
                    accs.append(ref == gold.target)
            dev_bleu = round(_bleu(os.path.join(args.output_dir, "test_{}.gold".format(str(idx))),
                                   os.path.join(args.output_dir, "test_{}.output".format(str(idx)))), 2)
            logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
            logger.info("  %s = %s " % ("xMatch", str(round(np.mean(accs) * 100, 4))))
            logger.info("  " + "*" * 20)
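# A typical invocation of the training/evaluation script above (paths and step
# counts are placeholders, not values from this repository):
#
#   python run.py \
#       --model_type roberta --model_name_or_path roberta-base \
#       --tokenizer_name roberta-base --output_dir ./saved_models \
#       --do_train --do_eval \
#       --train_filename train.src,train.tgt --dev_filename dev.src,dev.tgt \
#       --max_source_length 64 --max_target_length 32 \
#       --train_steps 10000 --eval_steps 1000 --train_batch_size 8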
def eval_bleu(args, model, tokenizer, file_type='test', num=20000):
    dataset = MethodDataset(tokenizer, args, file_type=file_type,
                            block_size=args.block_size, mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    for step, (batch, token_labels) in enumerate(test_dataloader):
        if step >= num:
            break
        inputs = batch.to(args.device)
        max_gen_len = min(256, args.block_size - inputs.shape[1] - 1)
        try:
            with torch.no_grad():
                beam_size = 5
                m = torch.nn.LogSoftmax(dim=-1)
                outputs = model(inputs, return_dict=True).past_key_values
                p = []
                zero = torch.cuda.LongTensor(1).fill_(0)
                for i in range(inputs.shape[0]):
                    # Expand each layer's (key, value) cache across the beam.
                    past_hidden = tuple(
                        tuple(xx[i:i + 1, :].expand(beam_size, -1, -1, -1) for xx in x)
                        for x in outputs)
                    beam = Beam(beam_size, tokenizer.bos_token_id, [tokenizer.eos_token_id])
                    input_ids = None
                    for _ in range(max_gen_len):
                        if beam.done():
                            break
                        input_ids = beam.getCurrentState()
                        transformer_outputs = model(input_ids, past_key_values=past_hidden,
                                                    return_dict=True)
                        out = m(transformer_outputs.logits[:, -1, :]).data
                        beam.advance(out)
                        past_hidden = tuple(
                            tuple(xx.data.index_select(0, beam.getCurrentOrigin()) for xx in x)
                            for x in transformer_outputs.past_key_values)
                    hyp = beam.getHyp(beam.getFinal())
                    pred = beam.buildTargetTokens(hyp)[:beam_size]
                    pred = [torch.cat([x.view(-1) for x in p] + [zero] * (max_gen_len - len(p))).view(1, -1)
                            for p in pred]
                    p.append(torch.cat(pred, 0).unsqueeze(0))
                p = torch.cat(p, 0)
                for pred in p:
                    t = pred[0].cpu().numpy()
                    t = list(t)
                    if 0 in t:
                        t = t[:t.index(0)]
                    text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                    # Drop a trailing </s> EOS marker, if any (rstrip("</s>")
                    # would also eat trailing 's' characters from the code).
                    if text.endswith("</s>"):
                        text = text[:-len("</s>")]
                    preds.append(text)
        except Exception:
            preds.append("")
        if step % args.logging_steps == 0:
            logger.info(f"{step} are done!")

    golds = []
    datafile = os.path.join(args.data_dir, f"{file_type}.jsonl")
    datas = open(datafile).readlines()
    for x in datas[:num]:
        x = json.loads(x)
        golds.append(x["body"])

    def post_process(code):
        # Restore placeholder tokens to plain source text before scoring.
        code = code.replace("<EOL>", "\n").replace("<INDENT>", " ").replace("<DEDENT>", " ")
        code = code.replace("<NUM_LIT>", "0").replace("<STR_LIT>", "").replace("<CHAR_LIT>", "")
        pattern = re.compile(r"<(STR|NUM|CHAR)_LIT:(.*?)>", re.S)
        lits = re.findall(pattern, code)
        for lit in lits:
            code = code.replace(f"<{lit[0]}_LIT:{lit[1]}>", lit[1])
        return " ".join(code.split())

    ES = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"), 'w') as f, \
         open(os.path.join(args.output_dir, f"{file_type}.gold"), 'w') as f1:
        for pred, gold in zip(preds, golds):
            pred = post_process(pred)
            gold = post_process(gold)
            f.write(pred + '\n')
            f1.write(gold + '\n')
            ES.append(fuzz.ratio(pred, gold))

    bleu_score = round(_bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
                             os.path.join(args.output_dir, f"{file_type}.output")), 2)
    ES = round(np.mean(ES), 2)
    print(bleu_score, ES)
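# A quick check of the literal-placeholder restoration performed by
# post_process above, re-stated standalone here so it can run on its own
# (`_post_process_demo` is a name introduced for this example only):
import re

def _post_process_demo(code):
    code = code.replace("<EOL>", "\n").replace("<INDENT>", " ").replace("<DEDENT>", " ")
    code = code.replace("<NUM_LIT>", "0").replace("<STR_LIT>", "").replace("<CHAR_LIT>", "")
    for kind, val in re.findall(r"<(STR|NUM|CHAR)_LIT:(.*?)>", code, re.S):
        code = code.replace(f"<{kind}_LIT:{val}>", val)
    return " ".join(code.split())

print(_post_process_demo("x = <NUM_LIT:42> <EOL> s = <STR_LIT>"))  # -> x = 42 s =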
def eval_bleu(args, model, tokenizer, file_type='test', num=2000):
    dataset = concodeDataset(tokenizer, args, logger, file_type=file_type,
                             block_size=args.block_size, mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    max_gen_len = 100
    for step, (batch, token_labels) in enumerate(test_dataloader):
        if step >= num:
            break
        inputs = batch.to(args.device)
        with torch.no_grad():
            beam_size = 10
            m = torch.nn.LogSoftmax(dim=-1)
            outputs = model(inputs)[1]
            p = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(inputs.shape[0]):
                # Compatible with transformers version 3.3.0 and 4.13.0: newer
                # versions return the per-layer cache as a (key, value) tuple,
                # older ones as a single stacked tensor.
                past = [torch.cat([x[0].unsqueeze(0), x[1].unsqueeze(0)], dim=0)
                        if type(x) == tuple else x
                        for x in outputs]
                past_hidden = [x[:, i:i + 1].expand(-1, beam_size, -1, -1, -1) for x in past]
                beam = Beam(beam_size, tokenizer.bos_token_id, tokenizer.eos_token_id)
                input_ids = None
                for _ in range(max_gen_len):
                    if beam.done():
                        break
                    input_ids = beam.getCurrentState()
                    transformer_outputs = model(input_ids, past=past_hidden)
                    out = m(transformer_outputs[0][:, -1, :]).data
                    beam.advance(out)
                    past = [torch.cat([x[0].unsqueeze(0), x[1].unsqueeze(0)], dim=0)
                            if type(x) == tuple else x
                            for x in transformer_outputs[1]]
                    past_hidden = [x.data.index_select(1, beam.getCurrentOrigin()) for x in past]
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:beam_size]
                pred = [torch.cat([x.view(-1) for x in p] + [zero] * (max_gen_len - len(p))).view(1, -1)
                        for p in pred]
                p.append(torch.cat(pred, 0).unsqueeze(0))
            p = torch.cat(p, 0)
            for pred in p:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                preds.append(text)
        if step % args.logging_steps == 0:
            logger.info(f"{step} are done!")

    golds = []
    datafile = os.path.join(args.data_dir, f"{file_type}.json")
    datas = open(datafile).readlines()
    for x in datas[:num]:
        x = json.loads(x)
        golds.append(x["code"])

    assert len(preds) == len(golds)

    EM = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"), 'w') as f, \
         open(os.path.join(args.output_dir, f"{file_type}.gold"), 'w') as f1:
        for pred, gold in zip(preds, golds):
            f.write(pred + '\n')
            f1.write(gold + '\n')
            EM.append(pred.split() == gold.split())

    if file_type == "test":
        # For the test split only the outputs are written; no scores are computed here.
        return 0, 0

    bleu_score = round(_bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
                             os.path.join(args.output_dir, f"{file_type}.output")), 2)
    EM = round(np.mean(EM) * 100, 2)
    return bleu_score, EM
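# The hand-rolled beam search above predates the more convenient Hugging Face
# generation API. On recent transformers versions a roughly equivalent decode
# would be the sketch below (an assumption about version compatibility, not a
# drop-in replacement; it will not reproduce the custom Beam's exact
# tie-breaking or padding behavior):
#
#   with torch.no_grad():
#       out = model.generate(inputs,
#                            num_beams=10,
#                            max_new_tokens=100,
#                            early_stopping=True,
#                            pad_token_id=tokenizer.pad_token_id,
#                            eos_token_id=tokenizer.eos_token_id)
#   text = tokenizer.decode(out[0][inputs.shape[1]:], clean_up_tokenization_spaces=False)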