Example #1
def main():
    parser = argparse.ArgumentParser(
        description=
        'Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--answers',
                        '-a',
                        required=True,
                        help="filename of the labels, in txt format.")
    parser.add_argument(
        '--predictions',
        '-p',
        required=True,
        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predictions, "r").readlines()
    gts = open(args.answers, "r").readlines()

    assert len(preds) == len(
        gts
    ), f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    edit_sim = 0.0
    for pred, gt in zip(preds, gts):
        pred = post_process(pred.strip())
        gt = post_process(gt.strip())
        edit_sim += fuzz.ratio(pred, gt)

    bleu_score = round(_bleu(args.answers, args.predictions), 2)
    logger.info(f"Edit sim: {round(edit_sim/total, 2)}, BLEU: {bleu_score}")
Example #2
def main():
    parser = argparse.ArgumentParser(
        description=
        'Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--expected',
                        '-a',
                        required=True,
                        help="filename of the labels, in test format.")
    parser.add_argument(
        '--predicted',
        '-p',
        required=True,
        help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predicted, "r").readlines()
    gts = open(args.expected, "r").readlines()

    assert len(preds) == len(
        gts
    ), f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    EM = 0.0
    for pred, gt in zip(preds, gts):
        pred = pred.strip()
        gt = gt.strip()
        pred = ' '.join([tok.strip() for tok in pred.split()])
        gt = ' '.join([tok.strip() for tok in gt.split()])
        if pred == gt:
            EM += 1

    bleu_score = round(_bleu(args.expected, args.predicted), 2)
    print(f"BLEU: {bleu_score}, EM: {round(EM / total * 100, 2)}")
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for BigCloneBench dataset.')
    parser.add_argument('--references', '-ref', help="filename of the labels, in txt format.")
    parser.add_argument('--predictions', '-pre', help="filename of the leaderboard predictions, in txt format.")
    
    args = parser.parse_args()

    refs = [x.strip() for x in open(args.references, 'r', encoding='utf-8').readlines()]
    pres = [x.strip() for x in open(args.predictions, 'r', encoding='utf-8').readlines()]
    
    assert len(refs) == len(pres)

    length = len(refs)
    count = 0
    for i in range(length):
        r = refs[i]
        p = pres[i]
        if r == p:
            count += 1
    acc = round(count/length*100, 2)
    
    bleu_score = round(_bleu(args.references, args.predictions),2)
    
    print('BLEU:', bleu_score, '; Acc:', acc)
Example #4
def main():
    parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for code completion (line level).')
    parser.add_argument('--answers', '-a', required=True, help="filename of the labels, in json format.")
    parser.add_argument('--predictions', '-p', required=True, help="filename of the leaderboard predictions, in txt format.")
    args = parser.parse_args()

    preds = open(args.predictions, "r").readlines()
    gts = open(args.answers, "r").readlines()

    assert len(preds) == len(gts), f"Samples of predictions and answers are not equal, {len(preds)}: {len(gts)}"

    total = len(gts)
    EM = 0.0
    wf = open("ground_truth.txt", "w")
    for pred, gt in zip(preds, gts):
        pred = pred.strip()
        gt = json.loads(gt)["code"]
        wf.write(gt+"\n")
        if pred.split() == gt.split():
            EM += 1

    bleu_score = round(_bleu("ground_truth.txt", args.predictions), 2)
    logger.info(f"BLEU: {bleu_score}, EM: {round(EM/total*100, 2)}")

    try:
        os.remove("ground_truth.txt")
    except Exception:
        pass
Example #5
def cal_bleu(hyp, ref):

    dev_bleu = round(_bleu(ref, hyp), 2)
    f1 = codecs.open(ref, "r", "utf-8")
    f2 = codecs.open(hyp, "r", "utf-8")
    accs = []
    for l1, l2 in zip(f1.readlines(), f2.readlines()):
        accs.append(l1.strip() == l2.strip())
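    # Note: `accs` collects per-line exact matches but is never reported; only BLEU is printed below.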

    print("bleu-4: ", str(dev_bleu))
Example #6
def calculate_scores(references, predictions, topk):
    length = len(references)
    count = 0
    for i in range(length):
        r = references[i]
        p = predictions[i]
        for j in range(topk):
            if p[j] == r:
                count += 1
                break
    acc = count / length * 100
    bleu_score = _bleu(references, predictions[:, :topk].tolist())
    return acc, bleu_score
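
Unlike the file-based evaluators above, this variant indexes `predictions` like a 2-D array and passes in-memory sequences to `_bleu`. A hypothetical call, assuming `predictions` is a NumPy object array of shape (N, k) holding k ranked candidate strings per sample and that this variant of `_bleu` accepts in-memory sequences rather than file paths (all names and data below are illustrative):

import numpy as np

refs = ["return a + b", "print ( x )"]
cands = np.array([["return a + b", "return a - b"],
                  ["print ( y )", "print ( x )"]], dtype=object)
acc, bleu = calculate_scores(refs, cands, topk=2)  # top-2 exact-match accuracy plus BLEU over the kept candidates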
Example #7
File: run.py Project: modit-team/MODIT
def eval_bleu(args, model, tokenizer, file_type='test', num=99999999):
    dataset = CodeChangeDataset(tokenizer,
                                args,
                                logger,
                                file_type=file_type,
                                block_size=args.block_size,
                                mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()
    preds = []
    for step, (batch, token_labels) in enumerate(
            tqdm(test_dataloader, total=min(num, len(dataset)))):
        if step >= num:
            break
        inputs = batch.to(args.device)
        with torch.no_grad():
            beam_size = args.beam_size
            m = torch.nn.LogSoftmax(dim=-1)
            outputs = model(inputs)[1]
            p = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(inputs.shape[0]):
                past_hidden = []
                for x in outputs:
                    _p = x[:, i:i + 1]
                    _q = _p.expand(-1, beam_size, -1, -1, -1)
                    past_hidden.append(_q)
                # context_mask=source_mask[i:i+1,:].expand(beam_size,-1)
                beam = Beam(beam_size, tokenizer.bos_token_id,
                            tokenizer.eos_token_id)
                input_ids = None
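                # 162 is the fixed maximum number of decoding steps; hypotheses are padded with zeros to this length below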
                for _ in range(162):
                    if beam.done():
                        break
                    input_ids = beam.getCurrentState()
                    transformer_outputs = model(input_ids, past=past_hidden)
                    out = m(transformer_outputs[0][:, -1, :]).data
                    beam.advance(out)
                    past_hidden = [
                        x.data.index_select(1, beam.getCurrentOrigin())
                        for x in transformer_outputs[1]
                    ]
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:beam_size]

                pred = [
                    torch.cat([x.view(-1)
                               for x in p] + [zero] * (162 - len(p))).view(
                                   1, -1) for p in pred
                ]
                p.append(torch.cat(pred, 0).unsqueeze(0))
            p = torch.cat(p, 0)
            for pred in p:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                preds.append(text)
    golds = []
    datas = read_data(data_dir=args.data_dir, file_type=file_type)
    for (src, tgt) in datas[:num]:
        golds.append(tgt)

    assert len(preds) == len(golds), 'Pred %d\tGold %d' % (len(preds),
                                                           len(golds))

    EM = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"),
              'w',
              encoding='utf-8') as f, open(os.path.join(
                  args.output_dir, f"{file_type}.gold"),
                                           'w',
                                           encoding='utf-8') as f1:
        for pred, gold in zip(preds, golds):
            f.write(pred + '\n')
            f1.write(gold + '\n')
            EM.append(pred.split() == gold.split())

    bleu_score = round(
        _bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
              os.path.join(args.output_dir, f"{file_type}.output")), 2)
    EM = round(np.mean(EM) * 100, 2)
    return bleu_score, EM
Example #8
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type: e.g. roberta")
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        required=True,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--load_model_path",
        default=None,
        type=str,
        help="Path to trained model: Should contain the .bin files")
    ## Other parameters
    parser.add_argument("--train_filename",
                        default=None,
                        type=str,
                        help="The train filenames (source and target files).")
    parser.add_argument("--dev_filename",
                        default=None,
                        type=str,
                        help="The dev filename. (source and target files).")
    parser.add_argument("--test_filename",
                        default=None,
                        type=str,
                        help="The test filename. (source and target files).")

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")

    parser.add_argument(
        "--max_source_length",
        default=64,
        type=int,
        help=
        "The maximum total source sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--max_target_length",
        default=32,
        type=int,
        help=
        "The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")

    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")

    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size",
                        default=10,
                        type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    # print arguments
    args = parser.parse_args()
    logger.info(args)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    args.device = device
    # Set seed
    set_seed(args)
    # make dir if output_dir not exist
    if os.path.exists(args.output_dir) is False:
        os.makedirs(args.output_dir)

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name, do_lower_case=args.do_lower_case)

    # build model
    encoder = model_class.from_pretrained(args.model_name_or_path,
                                          config=config)
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=config.hidden_size, nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder,
                    decoder=decoder,
                    config=config,
                    beam_size=args.beam_size,
                    max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id,
                    eos_id=tokenizer.sep_token_id)

    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path))

    model.to(device)
    if args.local_rank != -1:
        # Distributed training
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif args.n_gpu > 1:
        # multi-gpu training
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args,
                                                      stage='train')
        all_source_ids = torch.tensor([f.source_ids for f in train_features],
                                      dtype=torch.long)
        all_source_mask = torch.tensor([f.source_mask for f in train_features],
                                       dtype=torch.long)
        all_target_ids = torch.tensor([f.target_ids for f in train_features],
                                      dtype=torch.long)
        all_target_mask = torch.tensor([f.target_mask for f in train_features],
                                       dtype=torch.long)
        train_data = TensorDataset(all_source_ids, all_source_mask,
                                   all_target_ids, all_target_mask)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=num_train_optimization_steps)

        # Start training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info(
            "  Num epoch = %d", num_train_optimization_steps *
            args.train_batch_size // len(train_examples))

        model.train()
        dev_dataset = {}
        nb_tr_examples, nb_tr_steps, tr_loss, global_step, best_bleu, best_loss = 0, 0, 0, 0, 0, 1e6
        bar = range(num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)
        eval_flag = True
        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            source_ids, source_mask, target_ids, target_mask = batch
            loss, _, _ = model(source_ids=source_ids,
                               source_mask=source_mask,
                               target_ids=target_ids,
                               target_mask=target_mask)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            #logger.info("  step {} loss {}".format(global_step + 1, train_loss))

            if (global_step + 1) % 100 == 0:
                logger.info("  step {} loss {}".format(global_step + 1,
                                                       train_loss))
            nb_tr_examples += source_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # Update parameters
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                eval_flag = True

            if args.do_eval and ((global_step + 1) % args.eval_steps
                                 == 0) and eval_flag:
                # Eval model with dev dataset
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                eval_flag = False
                logger.info("Here1")
                if 'dev_loss' in dev_dataset:
                    logger.info("Here2")
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    logger.info("Here3")
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples,
                                                                 tokenizer,
                                                                 args,
                                                                 stage='dev')
                    all_source_ids = torch.tensor(
                        [f.source_ids for f in eval_features],
                        dtype=torch.long)
                    all_source_mask = torch.tensor(
                        [f.source_mask for f in eval_features],
                        dtype=torch.long)
                    all_target_ids = torch.tensor(
                        [f.target_ids for f in eval_features],
                        dtype=torch.long)
                    all_target_mask = torch.tensor(
                        [f.target_mask for f in eval_features],
                        dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask,
                                              all_target_ids, all_target_mask)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                logger.info("\n***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                # Start Evaling model
                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, target_ids, target_mask = batch

                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids,
                                             source_mask=source_mask,
                                             target_ids=target_ids,
                                             target_mask=target_mask)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()
                # Print loss of dev dataset
                model.train()
                eval_loss = eval_loss / tokens_num
                result = {
                    'eval_ppl': round(np.exp(eval_loss), 5),
                    'global_step': global_step + 1,
                    'train_loss': round(train_loss, 5)
                }
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                logger.info("  " + "*" * 20)

                # save last checkpoint
                last_output_dir = os.path.join(args.output_dir,
                                               'checkpoint-last')
                if not os.path.exists(last_output_dir):
                    os.makedirs(last_output_dir)
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(last_output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                if eval_loss < best_loss:
                    logger.info("  Best ppl:%s", round(np.exp(eval_loss), 5))
                    logger.info("  " + "*" * 20)
                    best_loss = eval_loss
                    # Save best checkpoint for best ppl
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-best-ppl')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

                    # Calculate bleu
                if 'dev_bleu' in dev_dataset:
                    logger.info("Here4")
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    logger.info("Here5")
                    eval_examples = read_examples(args.dev_filename)
                    eval_examples = random.sample(
                        eval_examples, min(1000, len(eval_examples)))
                    eval_features = convert_examples_to_features(eval_examples,
                                                                 tokenizer,
                                                                 args,
                                                                 stage='test')
                    all_source_ids = torch.tensor(
                        [f.source_ids for f in eval_features],
                        dtype=torch.long)
                    all_source_mask = torch.tensor(
                        [f.source_mask for f in eval_features],
                        dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)
                logger.info("Here5.5")
                model.eval()
                p = []
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask = batch
                    with torch.no_grad():
                        preds = model(source_ids=source_ids,
                                      source_mask=source_mask)
                        for pred in preds:
                            t = pred[0].cpu().numpy()
                            t = list(t)
                            if 0 in t:
                                t = t[:t.index(0)]
                            text = tokenizer.decode(
                                t, clean_up_tokenization_spaces=False)
                            p.append(text)
                model.train()
                logger.info("Here6")
                predictions = []
                accs = []
                with open(os.path.join(args.output_dir, "dev.output"),
                          'w') as f, open(
                              os.path.join(args.output_dir, "dev.gold"),
                              'w') as f1:
                    for ref, gold in zip(p, eval_examples):
                        predictions.append(str(gold.idx) + '\t' + ref)
                        f.write(ref + '\n')
                        f1.write(gold.target + '\n')
                        accs.append(ref == gold.target)

                dev_bleu = round(
                    _bleu(os.path.join(args.output_dir, "dev.gold"),
                          os.path.join(args.output_dir, "dev.output")), 2)
                logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
                logger.info("  %s = %s " %
                            ("xMatch", str(round(np.mean(accs) * 100, 4))))
                logger.info("  " + "*" * 20)
                if dev_bleu > best_bleu:
                    logger.info("  Best bleu:%s", dev_bleu)
                    logger.info("  " + "*" * 20)
                    best_bleu = dev_bleu
                    # Save best checkpoint for best bleu
                    output_dir = os.path.join(args.output_dir,
                                              'checkpoint-best-bleu')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_test:
        logger.info("Running Test")

        files = []
        if args.dev_filename is not None:
            files.append(args.dev_filename)
        if args.test_filename is not None:
            files.append(args.test_filename)
        for idx, file in enumerate(files):
            logger.info("Test file: {}".format(file))
            eval_examples = read_examples(file)
            eval_features = convert_examples_to_features(eval_examples,
                                                         tokenizer,
                                                         args,
                                                         stage='test')
            all_source_ids = torch.tensor(
                [f.source_ids for f in eval_features], dtype=torch.long)
            all_source_mask = torch.tensor(
                [f.source_mask for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_source_ids, all_source_mask)

            # Calculate bleu
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            p = []
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask = batch
                with torch.no_grad():
                    preds = model(source_ids=source_ids,
                                  source_mask=source_mask)
                    for pred in preds:
                        t = pred[0].cpu().numpy()
                        t = list(t)
                        if 0 in t:
                            t = t[:t.index(0)]
                        text = tokenizer.decode(
                            t, clean_up_tokenization_spaces=False)
                        p.append(text)
            model.train()
            predictions = []
            accs = []
            with open(
                    os.path.join(args.output_dir,
                                 "test_{}.output".format(str(idx))),
                    'w') as f, open(
                        os.path.join(args.output_dir,
                                     "test_{}.gold".format(str(idx))),
                        'w') as f1:
                for ref, gold in zip(p, eval_examples):
                    predictions.append(str(gold.idx) + '\t' + ref)
                    f.write(ref + '\n')
                    f1.write(gold.target + '\n')
                    accs.append(ref == gold.target)
            dev_bleu = round(
                _bleu(
                    os.path.join(args.output_dir,
                                 "test_{}.gold".format(str(idx))),
                    os.path.join(args.output_dir,
                                 "test_{}.output".format(str(idx)))), 2)
            logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
            logger.info("  %s = %s " %
                        ("xMatch", str(round(np.mean(accs) * 100, 4))))
            logger.info("  " + "*" * 20)
Example #9
def eval_bleu(args, model, tokenizer, file_type='test', num=20000):
    dataset = MethodDataset(tokenizer,
                            args,
                            file_type='test',
                            block_size=args.block_size,
                            mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    for step, (batch, token_labels) in enumerate(test_dataloader):
        if step >= num:
            break
        inputs = batch.to(args.device)
        max_gen_len = min(256, args.block_size - inputs.shape[1] - 1)
        try:
            with torch.no_grad():
                beam_size = 5
                m = torch.nn.LogSoftmax(dim=-1)
                outputs = model(inputs, return_dict=True).past_key_values
                p = []
                zero = torch.cuda.LongTensor(1).fill_(0)
                for i in range(inputs.shape[0]):
                    past_hidden = tuple(
                        tuple(xx[i:i + 1, :].expand(beam_size, -1, -1, -1)
                              for xx in x) for x in outputs)
                    # past_hidden = [x[:, i:i+1].expand(-1, beam_size, -1, -1, -1) for x in outputs]
                    beam = Beam(beam_size, tokenizer.bos_token_id,
                                [tokenizer.eos_token_id])
                    input_ids = None
                    for _ in range(max_gen_len):
                        if beam.done():
                            break
                        input_ids = beam.getCurrentState()
                        transformer_outputs = model(
                            input_ids,
                            past_key_values=past_hidden,
                            return_dict=True)
                        out = m(transformer_outputs.logits[:, -1, :]).data
                        beam.advance(out)
                        past_hidden = tuple(
                            tuple(
                                xx.data.index_select(
                                    0, beam.getCurrentOrigin()) for xx in x)
                            for x in transformer_outputs.past_key_values)
                        # past_hidden = [x.data.index_select(1, beam.getCurrentOrigin()) for x in transformer_outputs[1]]
                    hyp = beam.getHyp(beam.getFinal())
                    pred = beam.buildTargetTokens(hyp)[:beam_size]

                    pred = [
                        torch.cat([x.view(-1) for x in p] + [zero] *
                                  (max_gen_len - len(p))).view(1, -1)
                        for p in pred
                    ]
                    p.append(torch.cat(pred, 0).unsqueeze(0))
                p = torch.cat(p, 0)
                for pred in p:
                    t = pred[0].cpu().numpy()
                    t = list(t)
                    if 0 in t:
                        t = t[:t.index(0)]
                    text = tokenizer.decode(
                        t, clean_up_tokenization_spaces=False).rstrip("</s>")
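                    # caveat: str.rstrip("</s>") strips any trailing '<', '/', 's', '>' characters, not just the literal "</s>" suffix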
                    # print(text)
                    preds.append(text)
        except Exception:
            preds.append("")

        if step % args.logging_steps == 0:
            logger.info(f"{step} are done!")

    golds = []
    datafile = os.path.join(args.data_dir, f"{file_type}.jsonl")
    datas = open(datafile).readlines()
    for x in datas[:num]:
        x = json.loads(x)
        golds.append(x["body"])

    # assert len(preds) == len(golds)

    def post_process(code):
        code = code.replace("<EOL>",
                            "\n").replace("<INDENT>",
                                          " ").replace("<DEDENT>", " ")
        code = code.replace("<NUM_LIT>",
                            "0").replace("<STR_LIT>",
                                         "").replace("<CHAR_LIT>", "")
        pattern = re.compile(r"<(STR|NUM|CHAR)_LIT:(.*?)>", re.S)
        lits = re.findall(pattern, code)
        for lit in lits:
            code = code.replace(f"<{lit[0]}_LIT:{lit[1]}>", lit[1])
        return " ".join(code.split())

    ES = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"),
              'w') as f, open(
                  os.path.join(args.output_dir, f"{file_type}.gold"),
                  'w') as f1:
        for pred, gold in zip(preds, golds):
            pred = post_process(pred)
            gold = post_process(gold)
            f.write(pred + '\n')
            f1.write(gold + '\n')
            ES.append(fuzz.ratio(pred, gold))

    bleu_score = round(
        _bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
              os.path.join(args.output_dir, f"{file_type}.output")), 2)
    ES = round(np.mean(ES), 2)
    print(bleu_score, ES)
Example #10
File: run.py Project: microsoft/CodeXGLUE
def eval_bleu(args, model, tokenizer, file_type='test', num=2000):
    dataset = concodeDataset(tokenizer,
                             args,
                             logger,
                             file_type=file_type,
                             block_size=args.block_size,
                             mode='test')
    test_sampler = SequentialSampler(dataset)
    test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1)
    model.to(args.device)
    model.zero_grad()
    model.eval()

    preds = []
    max_gen_len = 100
    for step, (batch, token_labels) in enumerate(test_dataloader):
        if step >= num:
            break
        inputs = batch.to(args.device)
        # with torch.no_grad():
        #     outputs = model.generate(inputs, max_length=args.block_size, num_beams=10, temperature=0.7, early_stopping=False, top_k=70, \
        #               bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
        #     # outputs = model.generate(inputs, max_length=args.block_size, do_sample=True, temperature=0.7, top_k=70, top_p=0.95, \
        #     #         bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.pad_token_id, pad_token_id=tokenizer.pad_token_id)
        #     # outputs = model.generate(inputs, max_length=args.block_size, num_beams=10, temperature=0.7, early_stopping=False, top_k=70)
        #     # outputs = model.generate(inputs, max_length=args.block_size, do_sample=True, temperature=0.7, top_k=70, top_p=0.95)
        #     generation = tokenizer.decode(outputs[0])[len(tokenizer.decode(inputs[0])):]
        #     preds.append(generation.rstrip("<pad>"))

        with torch.no_grad():
            beam_size = 10
            m = torch.nn.LogSoftmax(dim=-1)
            outputs = model(inputs)[1]
            p = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(inputs.shape[0]):
                # Compatible with transformers version 3.3.0 and 4.13.0
                past = [
                    torch.cat([x[0].unsqueeze(0), x[1].unsqueeze(0)], dim=0)
                    if type(x) == tuple else x for x in outputs
                ]
                past_hidden = [
                    x[:, i:i + 1].expand(-1, beam_size, -1, -1, -1)
                    for x in past
                ]
                # context_mask=source_mask[i:i+1,:].expand(beam_size,-1)
                beam = Beam(beam_size, tokenizer.bos_token_id,
                            tokenizer.eos_token_id)
                input_ids = None
                for _ in range(max_gen_len):
                    if beam.done():
                        break
                    input_ids = beam.getCurrentState()
                    # context_mask=torch.cat((context_mask,input_ids*0+1),-1)
                    # mask=context_mask.unsqueeze(0).unsqueeze(-2).unsqueeze(-2).expand(self.config.n_layer, -1, -1, -1, -1)
                    transformer_outputs = model(input_ids, past=past_hidden)
                    out = m(transformer_outputs[0][:, -1, :]).data
                    # out = self.lsm(self.lm_head(transformer_outputs[0][:,-1,:])).data
                    beam.advance(out)
                    past = [
                        torch.cat([x[0].unsqueeze(0), x[1].unsqueeze(0)],
                                  dim=0) if type(x) == tuple else x
                        for x in transformer_outputs[1]
                    ]
                    past_hidden = [
                        x.data.index_select(1, beam.getCurrentOrigin())
                        for x in past
                    ]
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:beam_size]

                pred = [
                    torch.cat([x.view(-1) for x in p] + [zero] *
                              (max_gen_len - len(p))).view(1, -1) for p in pred
                ]
                p.append(torch.cat(pred, 0).unsqueeze(0))
            p = torch.cat(p, 0)
            for pred in p:
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                # print(text)
                preds.append(text)

        if step % args.logging_steps == 0:
            logger.info(f"{step} are done!")

    golds = []
    datafile = os.path.join(args.data_dir, f"{file_type}.json")
    datas = open(datafile).readlines()
    for x in datas[:num]:
        x = json.loads(x)
        golds.append(x["code"])

    assert len(preds) == len(golds)

    EM = []
    with open(os.path.join(args.output_dir, f"{file_type}.output"),
              'w') as f, open(
                  os.path.join(args.output_dir, f"{file_type}.gold"),
                  'w') as f1:
        for pred, gold in zip(preds, golds):
            f.write(pred + '\n')
            f1.write(gold + '\n')
            EM.append(pred.split() == gold.split())

    if file_type == "test":
        return 0, 0

    bleu_score = round(
        _bleu(os.path.join(args.output_dir, f"{file_type}.gold"),
              os.path.join(args.output_dir, f"{file_type}.output")), 2)
    EM = round(np.mean(EM) * 100, 2)
    return bleu_score, EM