def prune_heads(args, model, eval_dataloader, head_mask):
    """ This method shows how to prune head (remove heads weights) based on
        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().view(-1).tolist()) for layer in range(len(head_mask)))
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
                                                    compute_entropy=False, compute_importance=False, head_mask=None)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
def mask_heads(args, model, eval_dataloader):
    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    _, head_importance, preds, labels = compute_heads_importance(
        args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(
        preds,
        axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    original_score = compute_metrics(args.task_name, preds,
                                     labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score,
                original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        head_mask = new_head_mask.clone()  # save current head mask
        # heads from least important to most - keep only not-masked heads
        head_importance[head_mask == 0.0] = float('Inf')
        current_heads_to_mask = head_importance.view(-1).sort()[1]

        if len(current_heads_to_mask) <= num_to_mask:
            break

        # mask heads
        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
        _, head_importance, preds, labels = compute_heads_importance(
            args,
            model,
            eval_dataloader,
            compute_entropy=False,
            head_mask=new_head_mask)
        preds = np.argmax(
            preds, axis=1
        ) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds,
                                        labels)[args.metric_name]
        logger.info(
            "Masking: current score: %f, remaning heads %d (%.1f percents)",
            current_score, new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100)

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
    np.save(os.path.join(args.output_dir, 'head_mask.npy'),
            head_mask.detach().cpu().numpy())

    return head_mask
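
# Hedged aside (not part of the original script): a sketch of how the `head_mask.npy`
# saved above could be loaded again and turned into the `heads_to_prune` dict that
# `model.prune_heads` expects; the file path below is a placeholder.
def _head_mask_to_heads_to_prune(path="head_mask.npy"):
    import numpy as np
    import torch

    # head_mask has shape (num_layers, num_heads); 1.0 = keep the head, 0.0 = prune it
    head_mask = torch.from_numpy(np.load(path))
    return {
        layer: (1 - head_mask[layer].long()).nonzero().view(-1).tolist()
        for layer in range(len(head_mask))
    }
# e.g. model.prune_heads(_head_mask_to_heads_to_prune("output/head_mask.npy"))
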
def predict(model, eval_datasets, step, args):
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_output_dir = args.output_dir
    results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("Predicting...")
        logger.info("***** Running predictions *****")
        logger.info(" task name = %s", eval_task)
        logger.info("  Num  examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.predict_batch_size)
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        model.eval()

        pred_logits = []
        label_ids = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)
            with torch.no_grad():
                logits = model(input_ids, input_mask, segment_ids)
            cpu_logits = logits.detach().cpu()
            for i in range(len(cpu_logits)):
                pred_logits.append(cpu_logits[i].numpy())
                label_ids.append(labels[i])

        pred_logits = np.array(pred_logits)
        label_ids = np.array(label_ids)

        if args.output_mode == "classification":
            preds = np.argmax(pred_logits, axis=1)
        else:  # args.output_mode == "regression":
            preds = np.squeeze(pred_logits)
        result = compute_metrics(eval_task, preds, label_ids)
        logger.info(f"task:,{eval_task}")
        logger.info(f"result: {result}")
        results.update(result)

    output_eval_file = os.path.join(eval_output_dir,
                                    "eval_results-%s.txt" % eval_task)
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} task {} *****".format(
            step, eval_task))
        writer.write("step: %d ****\n " % step)
        for key in sorted(results.keys()):
            logger.info("%s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))
    model.train()
    return results
def predict_ens(models, eval_datasets, step, args):
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_output_dir = args.output_dir
    task_results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("Predicting...")
        logger.info("***** Running predictions *****")
        logger.info(" task name = %s", eval_task)
        logger.info("  Num  examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.predict_batch_size)
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        for model in models:
            model.eval()

        pred_logits = []
        label_ids = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)

            with torch.no_grad():
                logits_list = [
                    model(input_ids, input_mask, segment_ids)
                    for model in models
                ]
            logits = sum(logits_list) / len(logits_list)
            pred_logits.append(logits.detach().cpu())
            label_ids.append(labels)
        pred_logits = np.array(torch.cat(pred_logits), dtype=np.float32)
        label_ids = np.array(torch.cat(label_ids), dtype=np.int64)

        preds = np.argmax(pred_logits, axis=1)
        results = compute_metrics(eval_task, preds, label_ids)

        logger.info("***** Eval results {} task {} *****".format(
            step, eval_task))
        for key in sorted(results.keys()):
            logger.info(f"{eval_task} {key} = {results[key]:.5f}")
        task_results[eval_task] = results

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")

    write_results(output_eval_file, step, task_results, eval_task_names)
    for model in models:
        model.train()
    return task_results
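
# Hedged aside (not part of the original script): the core of the logit-averaging
# ensemble in `predict_ens`, reduced to plain tensors; the two Linear layers below
# are stand-ins for the fine-tuned classifiers.
def _ensemble_average_sketch():
    import torch

    torch.manual_seed(0)
    models = [torch.nn.Linear(8, 3), torch.nn.Linear(8, 3)]  # fake "models" producing 3-class logits
    inputs = torch.randn(4, 8)  # a fake batch of 4 examples
    with torch.no_grad():
        logits_list = [m(inputs) for m in models]
    logits = sum(logits_list) / len(logits_list)  # element-wise mean of the logits
    preds = logits.argmax(dim=1)
    print(preds.tolist())
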
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def predict(model, eval_datasets, step, args, examples=None, label_list=None):
    """

    :param model:
    :param eval_datasets:
    :param step:
    :param args:
    :param examples: collection of examples; if given together with label_list, predictions for a random sample are printed and all results are saved to an Excel file
    :param label_list: list of label names used to map predicted label ids back to label strings
    :return:
    """
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_output_dir = args.output_dir
    results = {}
    for eval_task, eval_dataset in zip(eval_task_names, eval_datasets):
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        logger.info("开始预测...")
        logger.info("***** Running predictions *****")
        logger.info(" 任务名称 = %s", eval_task)
        logger.info("  样本数 = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.predict_batch_size)
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)
        model.eval()

        # start time
        start_time = time.time()
        pred_logits = []
        label_ids = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=None):
            input_ids, input_mask, segment_ids, labels = batch
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            segment_ids = segment_ids.to(args.device)
            with torch.no_grad():
                logits = model(input_ids, input_mask, segment_ids)
            cpu_logits = logits.detach().cpu()
            for i in range(len(cpu_logits)):
                pred_logits.append(cpu_logits[i].numpy())
                label_ids.append(labels[i])

        pred_logits = np.array(pred_logits)
        # all ground-truth label ids
        label_ids = np.array(label_ids)
        if args.output_mode == "classification":
            # all predicted label ids
            preds = np.argmax(pred_logits, axis=1)
        else:  # args.output_mode == "regression":
            preds = np.squeeze(pred_logits)
        result = compute_metrics(eval_task, preds, label_ids)
        # randomly sample some examples to inspect the predictions
        if examples and label_list:
            # sample up to 100 examples
            num_example = 100
            print(f"Randomly printing {num_example} prediction results")
            total_examples = len(label_ids)
            display_examples = random.sample(range(total_examples),
                                             min(num_example, total_examples))
            print("example          keyword          true label         predicted label")
            for exp_idx in display_examples:
                print('%30s  %10s  %8s  %8s' %
                      (examples[exp_idx].text_a, examples[exp_idx].text_b,
                       examples[exp_idx].label, label_list[preds[exp_idx]]))
        if examples and label_list:
            # save all results to an Excel file
            data_dict = []
            for idx in range(total_examples):
                data_dict.append({
                    'example': examples[idx].text_a,
                    'keyword': examples[idx].text_b,
                    'true_label': examples[idx].label,
                    'predicted_label': label_list[preds[idx]]
                })
            import pandas as pd
            df = pd.DataFrame(data_dict)
            writer = pd.ExcelWriter("eval_result.xlsx")
            df.to_excel(writer)
            writer.save()
        logger.info(f"task:,{eval_task}")
        logger.info(f"result: {result}")
        results.update(result)

    cost_time = time.time() - start_time
    logger.info(
        f"--- Evaluating {len(eval_dataset)} examples took {cost_time} seconds in total, {cost_time/len(eval_dataset)} seconds per example ---"
    )
    output_eval_file = os.path.join(eval_output_dir,
                                    "eval_results-%s.txt" % eval_task)
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} task {} *****".format(
            step, eval_task))
        writer.write("step: %d ****\n " % step)
        for key in sorted(results.keys()):
            logger.info("%s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))
    model.train()
    return results
def evaluate(args, model, tokenizer, data_type="dev", prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type=data_type)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running {} evaluation {} *****".format(data_type, prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        # epoch_iterator = tqdm(eval_dataloader, desc="Evaluating")
        epoch_iterator = eval_dataloader
        for batch in epoch_iterator:
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3]
            inputs = {'input_ids':      input_ids,
                      'attention_mask': attention_mask,
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': token_type_ids if args.model_type in ['bert', 'xlnet'] and not args.no_segment else None,
                      'labels':         labels}
            with torch.no_grad():
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        if data_type == "dev":
            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)
    
            output_eval_file = os.path.join(eval_output_dir, "eval_{}_results.txt".format(data_type))
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval {} results {} *****".format(data_type, prefix))
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        else:
            def get_glue_task_name(task_name):
                if task_name == "cola":
                    return "CoLA"
                elif task_name == "sst-2":
                    return "SST-2"
                elif task_name == "mrpc":
                    return "MRPC"
                elif task_name == "sts-b":
                    return "STS-B"
                elif task_name == "qqp":
                    return "QQP"
                elif task_name == "mnli":
                    return "MNLI-m"
                elif task_name == "mnli-mm":
                    return "MNLI-mm"
                elif task_name == "qnli":
                    return "QNLI"
                elif task_name == "rte":
                    return "RTE"
                elif task_name == "wnli":
                    return "WNLI"
                else:
                    raise KeyError(task_name)
    
            label_list = processors[args.task_name]().get_labels()
            if args.task_name in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            headers = ["index", "predictioin"]
            file_saved = "{}.tsv".format(get_glue_task_name(eval_task))
            output_test_file = os.path.join(eval_output_dir, file_saved)
            with open(output_test_file, "w", encoding='utf-8') as f:
                logger.info("Save {} as GLUE data format".format(file_saved))
                writer = csv.writer(f, delimiter="\t")
                writer.writerow(headers)
                for index, pred in enumerate(preds):
                    if label_list[0] is None:
                        writer.writerow([index, str(pred)])
                    else:
                        writer.writerow([index, label_list[pred]])

        # print evaluation logs             
        log_string = "Job_{}:".format(args.job_id)
        log_string += " {}_{}:".format(eval_task, data_type)
        if prefix != "":
            log_string += " step={:<8d}".format(prefix)
        log_string += " {}_loss={:<8.5f}".format(data_type, eval_loss)
        for key in sorted(results.keys()):
            log_string += " {}_{}={:<8.5f}".format(data_type, key, results[key])
        logger.info(log_string)

    return results
def evaluate(args, model, tokenizer, prefix="", test=False):

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if args.task_name == "mnli" else (
                             args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        if test:
            eval_dataset = load_and_cache_examples(args,
                                                   eval_task,
                                                   tokenizer,
                                                   evaluate=True,
                                                   test_data=True)
        else:
            eval_dataset = load_and_cache_examples(args,
                                                   eval_task,
                                                   tokenizer,
                                                   evaluate=True,
                                                   test_data=False)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids':
                    batch[0],
                    'attention_mask':
                    batch[1],
                    'token_type_ids':
                    batch[2] if args.model_type in ['bert', 'xlnet'] else
                    None,  # XLM and RoBERTa don't use segment_ids
                    'labels':
                    batch[3]
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds_prob = copy.deepcopy(preds)
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        logger.info('dev length ' + str(len(preds)) + ', number of 1s: ' +
                    str(preds.sum()))
        if test:
            if not args.do_entity:
                test_o = pd.read_csv(
                    '/userhome/project/data_final/Test_Data_Title_processed_anzhaochusai.csv'
                )
                test_save = pd.DataFrame({
                    'id': test_o['id'],
                    'negative': preds,
                    'key_entity': np.nan
                })
                test_save.to_csv(os.path.join(eval_output_dir, 'result.csv'),
                                 index=False)
                np.save(os.path.join(eval_output_dir, 'test_prob.npy'),
                        preds_prob)
            else:
                test_o = pd.read_csv(os.path.join(args.data_dir, 'test.tsv'),
                                     sep='\t')
                test_save = pd.DataFrame({
                    'id': test_o['index'],
                    'negative': preds
                })
                test_save.to_csv(os.path.join(eval_output_dir, 'result.csv'),
                                 index=False)
                np.save(os.path.join(eval_output_dir, 'test_prob.npy'),
                        preds_prob)
        if prefix == 'save_eval_result':
            eval_o = pd.read_csv(os.path.join(args.data_dir, 'dev.tsv'),
                                 sep='\t')
            eval_save = pd.DataFrame({
                'sentence': eval_o['sentence'],
                'negative': preds,
                '0prob': preds_prob[:, 0],
                '1prob': preds_prob[:, 1]
            })
            eval_save.to_csv(os.path.join(eval_output_dir, 'eval_result.csv'),
                             index=False)

        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s\n", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, prefix="", TEST=False):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        if TEST:
            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True, TEST=TEST)
        else:
            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            if TEST:
                scores = preds[:,1]
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        if TEST:
            # do the full evaluation of Max F1, PR-AUC and P@1
            DATA_FOLDER = os.path.join("..", "..", "train_data")
            test_file = os.path.join(DATA_FOLDER, "test_shortest_count2_squad_final_train_data_features_with_new_info.tsv")
            test_data = pd.read_csv(test_file, sep='\t', header=None)
            test_bucket_indices = verify_and_generate_bucket_indices(test_data, last_column_index=104)
            acc, cm, roc_auc, pr_auc, ap, f1_max, p_max, r_max, precision, recall, thresholds, MRR, precision_at_1, counter_all_pos, classification_report, classification_report_str = my_evaluate(scores, preds, out_label_ids, test_bucket_indices)
            print("Accuracy:{}".format(acc))
            print("ROC_AUC_SCORE:{}".format(roc_auc))
            print("PR_AUC_score:{}".format(pr_auc))
            print("Average Precision Score:{}".format(ap))
            print("Max F1:{}".format(f1_max))
            print("Precision for max F1:{}".format(p_max))
            print("Recall for max F1:{}".format(r_max))
            print("MRR:{}".format(MRR))
            print("Precision@1:{}".format(precision_at_1))

            print("All Pos. Counter:\n{}".format(counter_all_pos))
            print("CM:\n{}".format(cm))
            print("Classification report:\n{}".format(classification_report_str))
            print("\n\n\n\n")
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
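
# Hedged aside (not part of the original script): `my_evaluate` above is project-specific,
# but the Max F1 and PR-AUC it reports can be computed from positive-class scores and
# binary labels with scikit-learn roughly as follows.
def _max_f1_and_pr_auc(scores, labels):
    import numpy as np
    from sklearn.metrics import auc, precision_recall_curve

    precision, recall, thresholds = precision_recall_curve(labels, scores)
    f1 = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)  # F1 at every threshold
    pr_auc = auc(recall, precision)  # area under the precision-recall curve
    return f1.max(), pr_auc
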
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    warmup_steps = args.warmup_steps if args.warmup_steps >= 1 else int(t_total * args.warmup_steps)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    # train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    train_iterator = range(int(args.num_train_epochs))
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    first_time = time.time()
    best_result = 0.0
    for idx_epoch in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        epoch_iterator = train_dataloader
        preds = None
        out_label_ids = None
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention_mask, token_type_ids, labels = batch[0], batch[1], batch[2], batch[3]
            inputs = {'input_ids':      input_ids,
                      'attention_mask': attention_mask,
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': token_type_ids if args.model_type in ['bert', 'xlnet'] and not args.no_segment else None,
                      'labels':         labels}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if preds is None:
                preds = outputs[1].detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, outputs[1].detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    # current loss
                    cur_loss =  (tr_loss - logging_loss) / args.logging_steps
                    tb_writer.add_scalar('loss', cur_loss, global_step)
                    logging_loss = tr_loss
                    # print log
                    log_string = "Job_{}:".format(args.job_id)
                    log_string += " epoch={:<3d}".format(idx_epoch)
                    log_string += " step={:<8d}".format(global_step)
                    log_string += " batch={:<4d}".format(labels.shape[0])
                    log_string += " lr={:<10.7f}".format(scheduler.get_lr()[0])
                    log_string += " train_loss={:<8.5f}".format(cur_loss)
                    log_string += " |g|={:<10.7f}".format(total_norm)
                    # calculate accuracy
                    if args.output_mode == "classification":
                        preds = np.argmax(preds, axis=1)
                    elif args.output_mode == "regression":
                        preds = np.squeeze(preds)
                    result = compute_metrics(args.task_name, preds, out_label_ids)
                    for key in sorted(result.keys()):
                        log_string += " {}_{}={:<8.5f}".format("train", key, result[key])
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    logger.info(log_string)
                    preds = None
                    out_label_ids = None

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            break

        if args.local_rank == -1 and not args.evaluate_during_training and args.evaluate_after_training:  # Only evaluate when single GPU otherwise metrics may not average well
            results = evaluate(args, model, tokenizer, data_type="dev", prefix=global_step)
            metrics = result_for_sorting(args.task_name, results)
            if metrics >= best_result:
                best_result = metrics
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, 'best')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
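
# Hedged aside (not part of the original script): the "linear warmup then linear decay"
# schedule prepared in train(), sketched with torch.optim.AdamW and the newer
# `get_linear_schedule_with_warmup` helper from `transformers`; the hyperparameters
# are placeholders.
def _optimizer_and_schedule_sketch(model, t_total, warmup_ratio=0.1, lr=2e-5, weight_decay=0.01):
    import torch
    from transformers import get_linear_schedule_with_warmup

    no_decay = ['bias', 'LayerNorm.weight']
    grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = torch.optim.AdamW(grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(warmup_ratio * t_total), num_training_steps=t_total)
    return optimizer, scheduler
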
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs'))

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    #train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    #set_seed(args)  # Added here for reproducibility, keeping the seed the same...
    # TODO(robinjia): does calling set_seed a second time matter?
    train_results = {}
    for epoch in range(int(args.num_train_epochs)):
        preds = None
        out_label_ids = None
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else
                None,  # XLM don't use segment_ids
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[:2]  # model outputs are always tuple in pytorch-transformers (see doc)

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        raise NotImplementedError
                        # TODO: make evaluation happen below
                        #results = evaluate(args, model, tokenizer)
                        #for key, value in results.items():
                        #    tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.save_every_epoch:
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-epoch{}'.format(epoch))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        results = compute_metrics(args.task_name, preds, out_label_ids)
        train_results[epoch] = results
        print("Train results: ", train_results)
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    tb_writer.close()
    #TODO, hacky but saves more significant restructuring...
    args.train_results = train_results
    return global_step, tr_loss / global_step, train_results
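
# Hedged aside (not part of the original script): the gradient-accumulation pattern used
# in the training loops above, reduced to a minimal self-contained example; the model,
# data and accumulation factor are placeholders.
def _gradient_accumulation_sketch(accumulation_steps=4):
    import torch

    torch.manual_seed(0)
    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = torch.nn.CrossEntropyLoss()
    data = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(8)]

    model.zero_grad()
    for step, (x, y) in enumerate(data):
        # scale the loss so the accumulated gradient is the average over the micro-batches
        loss = loss_fn(model(x), y) / accumulation_steps
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            model.zero_grad()
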
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        eval_accuracy = 0
        nb_eval_examples = 0
        preds = None
        all_out_label_ids = None
        n_classes = GLUE_TASKS_NUM_LABELS[args.task_name]
        print(args.task_name)
        print(n_classes)
        all_true_pos = [0.0 for c in range(n_classes)]
        all_true_neg = [0.0 for c in range(n_classes)]
        all_false_pos = [0.0 for c in range(n_classes)]
        all_false_neg = [0.0 for c in range(n_classes)]
        
        all_precision = [0.0 for c in range(n_classes)]
        all_recall = [0.0 for c in range(n_classes)]
        all_f1 = [0.0 for c in range(n_classes)]
        
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                          'labels':         batch[3]}
     
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
            
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            nb_eval_examples += inputs['input_ids'].size(0)
            print(nb_eval_examples)
            
            out_label_ids = inputs['labels']
            
            print("out_label_ids")
            print(out_label_ids)
            
            if args.task_name == "multilabel":
                metric_results = metrics_with_thresh(logits, out_label_ids)
            
                eval_accuracy += metric_results['accuracy']
                
                all_true_pos = [all_true_pos[c] + metric_results['true_pos'][c] for c in range(n_classes)]
                all_true_neg = [all_true_neg[c] + metric_results['true_neg'][c] for c in range(n_classes)]
                all_false_pos = [all_false_pos[c] + metric_results['false_pos'][c] for c in range(n_classes)]
                all_false_neg = [all_false_neg[c] + metric_results['false_neg'][c] for c in range(n_classes)]
            
            if all_out_label_ids is None:
                all_out_label_ids = out_label_ids.detach().cpu().numpy()
            else:
                all_out_label_ids = np.append(all_out_label_ids, out_label_ids.detach().cpu().numpy(), axis=0)
            

            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                
        if args.task_name == "multilabel":
            print(all_true_pos)
            print(all_true_neg)
            print(all_false_pos)
            print(all_false_neg)
            
            for c in range(n_classes):
                try: all_precision[c] = all_true_pos[c] / (all_true_pos[c] + all_false_pos[c])
                except ZeroDivisionError: all_precision[c] = float("Inf")
                try: all_recall[c] = all_true_pos[c] / (all_true_pos[c] + all_false_neg[c])
                except ZeroDivisionError: all_recall[c] = float("Inf")
                try: all_f1[c] = 2 * all_precision[c] * all_recall[c] / (all_precision[c] + all_recall[c])
                except ZeroDivisionError: all_f1[c] = float("Inf")
                
           
    
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_steps  # dividing by nb_eval_examples instead would make the accuracy 1/batch_size of the real value

        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(eval_task, preds, all_out_label_ids)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
            result = compute_metrics(eval_task, preds, all_out_label_ids)
        elif args.output_mode == "multi-classification":
            # ROC-AUC calculation
            # Compute ROC curve and ROC area for each class
            print(len(all_out_label_ids[0]))
            if len(all_out_label_ids[0]) > 1:
                fpr = dict()
                tpr = dict()
                roc_auc = dict()
                
                for i in range(len(inputs['labels'][0])):
                    print(len(inputs['labels']))
                    print(i)
                    fpr[i], tpr[i], _ = roc_curve(all_out_label_ids[:, i], preds[:, i])
                    roc_auc[i] = auc(fpr[i], tpr[i])
                    
                # Compute micro-average ROC curve and ROC area
                fpr["micro"], tpr["micro"], _ = roc_curve(all_out_label_ids.ravel(), preds.ravel())
                roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
                result = {'eval_loss': eval_loss,
                          'eval_accuracy': eval_accuracy,
                          'roc_auc': roc_auc,
                          'precision': all_precision,
                          'recall': all_recall,
                          'f1': all_f1}
            
            else:
                result = {'eval_loss': eval_loss,
                          'eval_accuracy': eval_accuracy,
                          'precision': all_precision,
                          'recall': all_recall,
                          'f1': all_f1}
        
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
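The `metrics_with_thresh` helper used in the multi-label branch above is not shown in this excerpt. A minimal sketch, assuming it thresholds sigmoid scores at 0.5 and returns per-batch exact-match accuracy plus per-class confusion counts under the keys the loop consumes (the threshold and the exact-match definition of accuracy are assumptions):

import numpy as np
import torch

def metrics_with_thresh(logits, labels, thresh=0.5):
    """Hypothetical multi-label metrics helper: per-batch exact-match accuracy
    plus per-class TP/TN/FP/FN counts."""
    probs = torch.sigmoid(logits).detach().cpu().numpy()
    y_true = labels.detach().cpu().numpy().astype(int)
    y_pred = (probs >= thresh).astype(int)
    return {
        'accuracy': float((y_pred == y_true).all(axis=1).mean()),
        'true_pos': ((y_pred == 1) & (y_true == 1)).sum(axis=0).tolist(),
        'true_neg': ((y_pred == 0) & (y_true == 0)).sum(axis=0).tolist(),
        'false_pos': ((y_pred == 1) & (y_true == 0)).sum(axis=0).tolist(),
        'false_neg': ((y_pred == 0) & (y_true == 1)).sum(axis=0).tolist(),
    }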
Example #13
    def query(self,
              examples,
              batch_size,
              do_evaluate=True,
              return_logits=False,
              do_recover=True,
              use_tqdm=True):
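        # optionally run each example through the recoverer before featurization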
        if do_recover:
            examples = [self.recoverer.recover_example(x) for x in examples]
        dataset = self._prep_examples(examples)
        eval_sampler = SequentialSampler(
            dataset)  # Makes sure order is correct
        eval_dataloader = DataLoader(dataset,
                                     sampler=eval_sampler,
                                     batch_size=batch_size)

        # Eval!
        logger.info("***** Querying model *****")
        logger.info("  Num examples = %d", len(examples))
        logger.info("  Batch size = %d", batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        example_idxs = None
        self.model.eval()
        if use_tqdm:
            eval_dataloader = tqdm(eval_dataloader, desc="Querying")
        for batch in eval_dataloader:
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2] if self.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                    'labels': batch[3]
                }
                outputs = self.model(**inputs)
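                # example indices are attached after the forward pass (so they are not fed
                # to the model) and are used below to identify misclassified examples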
                inputs['example_idxs'] = batch[4]
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
                example_idxs = inputs['example_idxs'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)
                example_idxs = np.append(
                    example_idxs,
                    inputs['example_idxs'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        logger.info('  eval_loss = %.6f', eval_loss)
        incorrect_example_indices = None
        if self.output_mode == "classification":
            pred_argmax = np.argmax(preds, axis=1)
            pred_labels = [
                self.label_list[pred_argmax[i]] for i in range(len(examples))
            ]
            incorrect_example_indices = set(example_idxs[np.not_equal(
                pred_argmax, out_label_ids)])

        elif self.output_mode == "regression":
            preds = np.squeeze(preds)

        if do_evaluate:
            result = compute_metrics(self.task_name, pred_argmax,
                                     out_label_ids)
            output_eval_file = os.path.join(
                self.output_dir, "eval-{}.txt".format(self.task_name))
            #print("Possible predictions: ", set(list(preds)))
            #priny("Model predictions: mean: {}, max: {}, min: {}".format(preds.mean(), preds.max(), preds.min()))
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        if return_logits:
            return preds
        else:
            return pred_labels
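Note on the method above: with the default arguments it returns the predicted labels, one per example; pass return_logits=True to get the raw logit matrix instead, do_recover=False to skip the recovery preprocessing, and do_evaluate=False to skip the metrics pass and the eval-file write.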
Example #14
def evaluate(args, model, tokenizer, prefix="", dev_evaluate=False):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    if "mnli" in args.task_name and "mnli-mm" not in args.task_name:
        args.eval_task_names.append("mnli-mm")

    results = {}
    for eval_task in args.eval_task_names:
        # eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, eval_labels, num_classes = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True,\
                dev_evaluate=dev_evaluate)

        print("num_classes ", num_classes, "eval_labels ", eval_labels)

        print(eval_dataset)
        args.eval_batch_size = args.per_gpu_eval_batch_size
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                if args.hypothesis_only or args.focal_loss or args.poe_loss:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                        'labels': batch[3],
                        'h_ids': batch[4],
                        'h_attention_mask': batch[5]
                    }
                elif args.hans_only:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                        'labels': batch[3],
                        'h_ids': batch[4],
                        'h_attention_mask': batch[5],
                        'p_ids': batch[6],
                        'p_attention_mask': batch[7],
                        'have_overlap': batch[8],
                        'overlap_rate': batch[9],
                        'subsequence': batch[10],
                        'constituent': batch[11]
                    }
                else:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                        'labels': batch[3]
                    }

                outputs = model(**inputs)["bert"]
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps

        max_preds = np.argmax(preds, axis=1)

        # convert 1,2 labels to 1 in case of binary dataset.
        if num_classes == 2 and args.binerize_eval:
            max_preds = binarize_preds(max_preds)
            out_label_ids = binarize_preds(out_label_ids)

        if eval_task in nli_task_names:
            eval_task_metric = "nli"
        elif eval_task.startswith("fever"):
            eval_task_metric = "fever"
        elif eval_task.startswith("HANS"):
            eval_task_metric = "hans"
        else:
            eval_task_metric = eval_task

        result = compute_metrics(eval_task_metric, max_preds, out_label_ids)

        if args.save_labels_file is not None:
            save_labels_file = args.save_labels_file + "_" + eval_task
            if args.output_label_format == "kaggle":
                write_in_kaggle_format(args, max_preds, eval_labels,
                                       save_labels_file, eval_task)
            elif args.output_label_format == "numpy":
                write_in_numpy_format(args, preds, save_labels_file)

        results[eval_task] = result["acc"]
        if eval_task.startswith("HANS"):
            results[eval_task + "_not-entailment"] = result["acc_0"]
            results[eval_task + "_entailment"] = result["acc_1"]
        print("results is ", result, " eval_task ", eval_task)

    return results, preds
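The `binarize_preds` helper is defined elsewhere in the script; a minimal sketch under the assumption stated in the comment above (labels 1 and 2 collapse to 1, 0 stays 0):

import numpy as np

def binarize_preds(preds):
    """Assumed behaviour: map labels {0, 1, 2} to {0, 1}, with 1 and 2 both becoming 1."""
    preds = np.asarray(preds)
    return (preds != 0).astype(preds.dtype)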
Example #15
def evaluate(args, model, tokenizer, epoch=0, is_test=False):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = args.task_name
    eval_output_dir = args.output_dir

    set_type = 'test' if is_test else 'dev'
    results = {}
    for task_id, eval_task in enumerate(eval_task_names):
        if is_test and not hasattr(processors[eval_task], 'get_test_examples'):
            continue

        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               set_type)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info(
            "***** Running evaluation for {} on {} for epoch {} *****".format(
                eval_task, set_type, epoch))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logits_all = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2],  # XLM doesn't use segment_ids
                    'labels': batch[3],
                    'task_id': task_id
                }
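                # task_id identifies which of the eval tasks this batch belongs to,
                # so a multi-task model can use the matching task-specific head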
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                # input_ids, input_mask, segment_ids, label_ids = batch
                # tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, task_id=task_id)

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if logits_all is None:
                logits_all = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                logits_all = np.append(logits_all,
                                       logits.detach().cpu().numpy(),
                                       axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        output_mode = output_modes[eval_task]
        if output_mode in ["classification", "multi-choice"]:
            preds = np.argmax(logits_all, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(logits_all)
        result = compute_metrics(eval_task, preds, out_label_ids.reshape(-1))
        results.update(result)

        output_eval_file = os.path.join(
            eval_output_dir,
            "eval_results_{}_{}.txt".format(eval_task, set_type))
        with open(output_eval_file, "a") as writer:
            logger.info(
                "***** Eval results for {} on {} for epoch {} *****".format(
                    eval_task, set_type, epoch))
            writer.write(
                "***** Eval results for epoch {} *****\n".format(epoch))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            logger.info("\n")

        # get error idx
        correct_idx = np.argwhere(preds == out_label_ids).tolist()
        wrong_idx = np.argwhere(preds != out_label_ids).tolist()
        wrong_idx_dict = {
            'correct': correct_idx,
            'wrong': wrong_idx,
            'preds': preds.tolist(),
            'logits': logits_all.tolist(),
            'labels': out_label_ids.tolist()
        }
        json.dump(
            wrong_idx_dict,
            open(
                os.path.join(
                    eval_output_dir,
                    "error_idx_{}_{}.json".format(eval_task, set_type)), 'w'))

    return results
Example #16
def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task = args.task_name
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args,
                                           eval_task,
                                           tokenizer,
                                           evaluate=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids,
                                      inputs['labels'].detach().cpu().numpy(),
                                      axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)

    result = compute_metrics(eval_task, preds, out_label_ids)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
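A hedged usage sketch for the evaluate function above, assuming the surrounding script's helpers (load_and_cache_examples, compute_metrics) are importable and that a fine-tuned checkpoint lives under a hypothetical ./mrpc_model directory:

from argparse import Namespace

import torch
from transformers import BertForSequenceClassification, BertTokenizer

args = Namespace(task_name="mrpc",                    # hypothetical task
                 output_dir="./mrpc_model",           # hypothetical checkpoint directory
                 per_gpu_eval_batch_size=32,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model = BertForSequenceClassification.from_pretrained(args.output_dir).to(args.device)
tokenizer = BertTokenizer.from_pretrained(args.output_dir)
result = evaluate(args, model, tokenizer, prefix="final")
print(result)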
Example #17
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             eval_set='dev',
             save_aps=False):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if args.task_name == "mnli" else (
                             args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer,
                                               eval_set)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        if eval_set == 'train' and save_aps:  # batch size 2 so each recorded loss lines up with one query's candidate pair
            args.eval_batch_size = 2
        else:
            args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        all_losses = []
        for batch in eval_dataloader:
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                    'labels': batch[3]
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                all_losses.append(tmp_eval_loss.mean().item())

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs['labels'].detach().cpu().numpy(),
                    axis=0)
            if args.debug_mode:
                break

        eval_loss = eval_loss / nb_eval_steps
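        # ranking-style tasks: score each candidate by its softmax probability of the positive class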
        if args.task_name == "ms_v2" or args.task_name == "udc" or \
            args.task_name == "mantis_10" or args.task_name == "mantis_50":
            preds = softmax(preds, axis=1)
            preds = preds[:, 1]
        elif args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        if save_aps:
            assert args.local_rank == -1
            aps = compute_aps(preds, out_label_ids)
            output_eval_file = os.path.join(eval_output_dir,
                                            "aps_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for ap in aps:
                    f.write(str(ap) + "\n")

            output_eval_file = os.path.join(eval_output_dir,
                                            "losses_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for loss in all_losses:
                    f.write(str(loss) + "\n")

            output_eval_file = os.path.join(eval_output_dir,
                                            "preds_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for pred in preds:
                    f.write(str(pred) + "\n")

            negative_sampled_size = 2
            preds_q_docs_avg = []
            for i in range(0, len(preds), negative_sampled_size):
                preds_q_docs_avg.append(
                    sum(preds[i:i + negative_sampled_size]) /
                    negative_sampled_size)
            output_eval_file = os.path.join(eval_output_dir,
                                            "avg_preds_" + args.run_name)
            with open(output_eval_file, "w") as f:
                for avg in preds_q_docs_avg:
                    f.write(str(avg) + "\n")

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
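The `softmax` call in the ranking branch above is assumed to be `scipy.special.softmax`; a minimal sketch of that scoring step in isolation:

import numpy as np
from scipy.special import softmax

logits = np.array([[2.0, 0.5],    # candidate 1: [not-relevant, relevant] logits
                   [0.1, 1.3]])   # candidate 2
scores = softmax(logits, axis=1)[:, 1]   # probability of the relevant class, used as the ranking score
print(scores)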