Example #1
def prune_heads(args, model, eval_dataloader, head_mask):
    """This method shows how to prune head (remove heads weights) based on
    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(args,
                                                   model,
                                                   eval_dataloader,
                                                   compute_entropy=False,
                                                   compute_importance=False,
                                                   head_mask=head_mask)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
    heads_to_prune = {
        layer: (1 - head_mask[layer].long()).nonzero().squeeze().tolist()
        for layer in range(len(head_mask))
    }

    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
        args,
        model,
        eval_dataloader,
        compute_entropy=False,
        compute_importance=False,
        head_mask=None,
        actually_pruned=True,
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info(
        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
        original_num_params,
        pruned_num_params,
        pruned_num_params / original_num_params * 100,
    )
    logger.info("Pruning: score with masking: %f score with pruning: %f",
                score_masking, score_pruning)
    logger.info(
        "Pruning: speed ratio (original timing / new timing): %f percent",
        original_time / new_time * 100)
Example #2
def adapted_glue_compute_metrics(task_name, preds, labels):
    "Adapted from `glue_compute_metrics` to also handle SNLI."
    try:
      return glue_compute_metrics(task_name, preds, labels)
    except KeyError:
      if task_name in ["snli", "winogrande", "toxic"]:
        # Since MNLI also uses accuracy.
        return glue_compute_metrics("mnli", preds, labels)
    raise KeyError(task_name)
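A minimal usage sketch for this wrapper, assuming the (legacy) glue_compute_metrics export from transformers is importable and sklearn is installed; the arrays below are made up:

import numpy as np

# Hypothetical predictions and gold labels for a 3-class SNLI-style task.
preds = np.array([0, 1, 2, 1])
labels = np.array([0, 1, 1, 1])

# "snli" is not a GLUE task, so glue_compute_metrics raises KeyError and the
# wrapper falls back to the MNLI metric, which is plain accuracy.
print(adapted_glue_compute_metrics("snli", preds, labels))

# A task name the wrapper does not know about still surfaces as a KeyError.
# adapted_glue_compute_metrics("unknown-task", preds, labels)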
Example #3
def mask_heads(args, model, eval_dataloader):
    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        head_mask = new_head_mask.clone()  # save current head mask
        # heads from least important to most - keep only not-masked heads
        head_importance[head_mask == 0.0] = float("Inf")
        current_heads_to_mask = head_importance.view(-1).sort()[1]

        if len(current_heads_to_mask) <= num_to_mask:
            break

        # mask heads
        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
        new_head_mask = new_head_mask.clone().detach()
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
        _, head_importance, preds, labels = compute_heads_importance(
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
            "Masking: current score: %f, remaining heads %d (%.1f percents)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
        )

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())

    return head_mask
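In the bertology-style script these two routines are typically chained: the mask returned by mask_heads is handed to prune_heads (Example #1), which physically removes the selected head weights. A minimal sketch of that flow, assuming args, model and eval_dataloader are set up as in the surrounding script:

# Hypothetical driver: search for a head mask, then actually prune those heads.
head_mask = mask_heads(args, model, eval_dataloader)
prune_heads(args, model, eval_dataloader, head_mask)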
Example #4
 def compute_metrics(p: EvalPrediction) -> Dict:
     if output_mode == "classification":
         preds = np.argmax(p.predictions, axis=1)
     elif output_mode == "regression":
         preds = np.squeeze(p.predictions)
     return glue_compute_metrics(data_args.task_name, preds,
                                 p.label_ids)
Example #5
 def compute_metrics_fn(p: EvalPrediction):
     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
     if output_mode == "classification":
         preds = np.argmax(preds, axis=1)
     else:  # regression
         preds = np.squeeze(preds)
     return glue_compute_metrics(task_name, preds, p.label_ids)
Example #6
 def compute_metrics_fn(p: EvalPrediction):
     if output_mode == "classification":
         preds = np.argmax(p.predictions, axis=1)
     elif output_mode == "regression":
         preds = np.squeeze(p.predictions)
     metrics = glue_compute_metrics(task_name, preds, p.label_ids)
     return metrics
Example #7
def compute_glue_metrics(task_name, p):
    output_mode = glue_output_modes[task_name]

    if output_mode == "classification":
        preds = np.argmax(p.predictions, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(p.predictions)
    return glue_compute_metrics(task_name, preds, p.label_ids)
Example #8
 def compute_metrics_fn(p: transformers.EvalPrediction):
     if output_mode == "classification":
         preds = np.argmax(p.predictions, axis=1)
     elif output_mode == "regression":
         preds = np.squeeze(p.predictions)
     if task_name in ynt.genernal_tasks_num_labels:
         return ynt.genernal_compute_metrics(task_name, preds,
                                             p.label_ids)
     elif task_name in transformers.glue_tasks_num_labels:
         return transformers.glue_compute_metrics(
             task_name, preds, p.label_ids)
Example #9
def evaluate(task_name, model, eval_dataloader, model_type, output_mode = 'classification', device='cuda'):
    # results = {}

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch_idx, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        progress_bar(batch_idx, len(eval_dataloader), 'Evaluating...')

    eval_loss = eval_loss / nb_eval_steps
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)

    result = glue_compute_metrics(task_name, preds, out_label_ids)
    # print(result)
    # results.update(result)
    return result
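A hedged invocation sketch for this evaluator: it only needs a GLUE task name, a fine-tuned sequence-classification model and a DataLoader yielding (input_ids, attention_mask, token_type_ids, labels) batches, all assumed to be built elsewhere:

# Hypothetical call; model and eval_dataloader are assumed to exist already.
result = evaluate("mrpc", model, eval_dataloader, model_type="bert")
print(result)  # for MRPC, glue_compute_metrics returns acc, f1 and acc_and_f1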
Example #10
def compute_glue_eval_metrics_regression(task_name: str,
                                         p: EvalPrediction) -> Dict:
    preds = np.squeeze(p.predictions)
    return glue_compute_metrics(task_name, preds, p.label_ids)
Example #11
def compute_glue_eval_metrics(task_name: str, p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(task_name, preds, p.label_ids)
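These two helpers take the task name as their first argument, so they have to be partially applied before being handed to a transformers Trainer, whose compute_metrics callback receives only an EvalPrediction. A minimal sketch, assuming a classification task such as SST-2 and a model/dataset set up elsewhere:

from functools import partial
from transformers import Trainer

# Bind the task name so the callable matches the expected
# signature: EvalPrediction -> Dict[str, float].
compute_metrics = partial(compute_glue_eval_metrics, "sst-2")

trainer = Trainer(
    model=model,                      # assumed to exist
    args=training_args,               # assumed to exist
    eval_dataset=eval_dataset,        # assumed to exist
    compute_metrics=compute_metrics,
)
metrics = trainer.evaluate()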
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_models", type=int, help="Number of models")
    parser.add_argument("--k",
                        type=int,
                        default=16,
                        help="Number of training instances per label")
    parser.add_argument(
        "--condition",
        type=str,
        help="A dictionary containing conditions that the experiment results need to fulfill (e.g., tag, task_name, few_shot_type)",
    )

    # These options should usually be kept as their default values
    parser.add_argument("--data_dir",
                        type=str,
                        default="data/k-shot",
                        help="Data directory")
    parser.add_argument("--save_logit_dir",
                        type=str,
                        default="ensemble_predict_results",
                        help="Directory to store the logit file.")
    parser.add_argument("--log", type=str, default="log", help="Log path.")
    parser.add_argument("--key",
                        type=str,
                        default='',
                        help="Validation metric name")
    parser.add_argument("--test_key",
                        type=str,
                        default="",
                        help="Test metric name")
    parser.add_argument("--test_key2",
                        type=str,
                        default="",
                        help="Second test metric name")

    args = parser.parse_args()

    condition = eval(args.condition)

    if len(args.key) == 0:
        if condition['task_name'] == 'cola':
            args.key = 'cola_dev_eval_mcc'
            args.test_key = 'cola_test_eval_mcc'
        elif condition['task_name'] == 'mrpc/acc':
            args.key = 'mrpc_dev_eval_acc'
            args.test_key = 'mrpc_test_eval_acc'
            args.test_key2 = 'mrpc_test_eval_f1'
            condition['task_name'] = 'mrpc'
        elif condition['task_name'] == 'mrpc/f1':
            args.key = 'mrpc_dev_eval_f1'
            args.test_key2 = 'mrpc_test_eval_acc'
            args.test_key = 'mrpc_test_eval_f1'
            condition['task_name'] = 'mrpc'
        elif condition['task_name'] == 'qqp/acc':
            args.key = 'qqp_dev_eval_acc'
            args.test_key = 'qqp_test_eval_acc'
            args.test_key2 = 'qqp_test_eval_f1'
            condition['task_name'] = 'qqp'
        elif condition['task_name'] == 'qqp/f1':
            args.key = 'qqp_dev_eval_f1'
            args.test_key2 = 'qqp_test_eval_acc'
            args.test_key = 'qqp_test_eval_f1'
            condition['task_name'] = 'qqp'
        elif condition['task_name'] == 'sts-b/pearson':
            args.key = 'sts-b_dev_eval_pearson'
            args.test_key = 'sts-b_test_eval_pearson'
            args.test_key2 = 'sts-b_test_eval_spearmanr'
            condition['task_name'] = 'sts-b'
        elif condition['task_name'] == 'sts-b/spearmanr':
            args.key = 'sts-b_dev_eval_spearmanr'
            args.test_key2 = 'sts-b_test_eval_pearson'
            args.test_key = 'sts-b_test_eval_spearmanr'
            condition['task_name'] = 'sts-b'
        elif condition['task_name'] == 'qnli':
            args.key = 'qnli_dev_eval_acc'
            args.test_key = 'qnli_test_eval_acc'
        elif condition['task_name'] == 'sst-2':
            args.key = 'sst-2_dev_eval_acc'
            args.test_key = 'sst-2_test_eval_acc'
        elif condition['task_name'] == 'snli':
            args.key = 'snli_dev_eval_acc'
            args.test_key = 'snli_test_eval_acc'
        elif condition['task_name'] == 'mnli':
            args.key = 'mnli_dev_eval_mnli/acc'
            args.test_key = 'mnli_test_eval_mnli/acc'
        elif condition['task_name'] == 'mnli-mm':
            args.key = 'mnli_dev_eval_mnli/acc'
            args.test_key = 'mnli-mm_test_eval_mnli-mm/acc'
        elif condition['task_name'] == 'rte':
            args.key = 'rte_dev_eval_acc'
            args.test_key = 'rte_test_eval_acc'
        elif condition['task_name'] == 'ag_news':
            args.key = 'ag_news_dev_eval_acc'
            args.test_key = 'ag_news_test_eval_acc'
        elif condition['task_name'] == 'yahoo_answers':
            args.key = 'yahoo_answers_dev_eval_acc'
            args.test_key = 'yahoo_answers_test_eval_acc'
        elif condition['task_name'] == 'yelp_review_full':
            args.key = 'yelp_review_full_dev_eval_acc'
            args.test_key = 'yelp_review_full_test_eval_acc'
        elif condition['task_name'] == 'mr':
            args.key = 'mr_dev_eval_acc'
            args.test_key = 'mr_test_eval_acc'
        elif condition['task_name'] == 'sst-5':
            args.key = 'sst-5_dev_eval_acc'
            args.test_key = 'sst-5_test_eval_acc'
        elif condition['task_name'] == 'subj':
            args.key = 'subj_dev_eval_acc'
            args.test_key = 'subj_test_eval_acc'
        elif condition['task_name'] == 'trec':
            args.key = 'trec_dev_eval_acc'
            args.test_key = 'trec_test_eval_acc'
        elif condition['task_name'] == 'cr':
            args.key = 'cr_dev_eval_acc'
            args.test_key = 'cr_test_eval_acc'
        elif condition['task_name'] == 'mpqa':
            args.key = 'mpqa_dev_eval_acc'
            args.test_key = 'mpqa_test_eval_acc'
        else:
            raise NotImplementedError

    with open(args.log) as f:
        result_list = []
        for line in f:
            result_list.append(eval(line))

    seed_result = {}
    seed_best = {}

    # Gather all logs satisfying the conditions
    for item in result_list:
        ok = True
        for cond in condition:
            if cond == 'task_name' and condition['task_name'] == 'mnli-mm':
                if cond not in item or item[cond] != 'mnli':
                    ok = False
                    break
            else:
                if cond not in item or item[cond] != condition[cond]:
                    ok = False
                    break
        if 'model_id' not in item or 'array_id' not in item:
            ok = False

        if ok:
            seed = int(item['data_dir'].split('-')[-1])
            model_id = item['model_id']
            array_id = item['array_id']

            if model_id >= 0 and model_id < args.n_models:
                if seed not in seed_result:
                    seed_result[seed] = {}
                    seed_best[seed] = {}
                if model_id not in seed_result[seed]:
                    seed_result[seed][model_id] = []
                    seed_best[seed][model_id] = {args.key: -1e9}

                seed_result[seed][model_id].append(item)
                if item[args.key] > seed_best[seed][model_id][args.key]:
                    seed_best[seed][model_id] = item

    final_result_dev = np.zeros((len(seed_result), args.n_models))
    final_result_test = np.zeros((len(seed_result), args.n_models))
    final_result_test2 = np.zeros((len(seed_result), args.n_models))

    logit_file_list = {}
    for seed in seed_result:
        logit_file_list[seed] = []

    # Get the results for each model and pick the best dev trial for each model/seed
    for model_id in range(args.n_models):
        for i, seed in enumerate(seed_result):
            final_result_dev[i][model_id] = seed_best[seed][model_id][args.key]
            final_result_test[i][model_id] = seed_best[seed][model_id][
                args.test_key]
            if len(args.test_key2) > 0:
                final_result_test2[i][model_id] = seed_best[seed][model_id][
                    args.test_key2]

            logit_file_list[seed].append("{}-{}-{}.npy".format(
                condition['task_name'], model_id,
                seed_best[seed][model_id]["array_id"]))

        s = "Model %d | val: mean +- std: %.1f +- %.1f | test: mean +- std: %.1f (%.1f) (median %.1f)" % (
            model_id, final_result_dev[:, model_id].mean() * 100,
            final_result_dev[:, model_id].std() * 100,
            final_result_test[:, model_id].mean() * 100,
            final_result_test[:, model_id].std() * 100,
            np.median(final_result_test[:, model_id]) * 100)
        if len(args.test_key2) > 0:
            s += " / %.1f +- %.1f (median %.1f)" % (
                final_result_test2[:, model_id].mean() * 100,
                final_result_test2[:, model_id].std() * 100,
                np.median(final_result_test2[:, model_id]) * 100)
        print(s)

    # Map lower-case names to official names (data folder name)
    data_dir_mapping = {
        'cola': 'CoLA',
        'mrpc': 'MRPC',
        'qqp': 'QQP',
        'sts-b': 'STS-B',
        'sst-2': 'SST-2',
        'snli': 'SNLI',
        'mnli': 'MNLI',
        'mnli-mm': 'MNLI',
        'rte': 'RTE',
        'ag_news': 'ag_news',
        'yahoo_answers': 'yahoo_answers',
        'yelp_review_full': 'yelp_review_full',
        'sst-5': 'sst-5',
        'mr': 'mr',
        'cr': 'cr',
        'mpqa': 'mpqa',
        'subj': 'subj',
        'trec': 'trec'
    }

    tokenizer = AutoTokenizer.from_pretrained('roberta-large')
    ensemble_result = np.zeros((len(seed_result)))
    ensemble_result2 = np.zeros((len(seed_result)))  # for second metric

    # Ensemble for each seed
    for seed_id, seed in enumerate(seed_result):
        labels = get_labels(args.data_dir, args.k, seed,
                            condition['task_name'],
                            data_dir_mapping[condition['task_name']])

        # Logits
        mean_logits = None
        for fname in logit_file_list[seed]:
            logits = np.load(os.path.join(args.save_logit_dir, fname))
            if mean_logits is None:
                mean_logits = logits
            else:
                mean_logits += logits
        mean_logits /= len(logit_file_list[seed])

        # Compute metrics
        preds = mean_logits.argmax(-1)
        if condition['task_name'] in [
                'sst-5', 'mr', 'cr', 'mpqa', 'subj', 'trec'
        ]:
            metric = {"acc": simple_accuracy(preds, labels)}
        else:
            metric = glue_compute_metrics(condition['task_name'], preds,
                                          labels)

        ensemble_result[seed_id] = metric[args.test_key.split('_')[-1]]
        if len(args.test_key2) > 0:
            ensemble_result2[seed_id] = metric[args.test_key2.split('_')[-1]]

    s = "mean +- std: %.1f (%.1f) (median %.1f)" % (
        ensemble_result.mean() * 100, ensemble_result.std() * 100,
        np.median(ensemble_result) * 100)
    if len(args.test_key2) > 0:
        s += " / %.1f (%.1f) (median %.1f)" % (
            ensemble_result2.mean() * 100, ensemble_result2.std() * 100,
            np.median(ensemble_result2) * 100)
    print(s)
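The heart of the ensembling step above is plain logit averaging: the per-model logit files for one seed are loaded, averaged, arg-maxed and scored with glue_compute_metrics. A stripped-down sketch with made-up logits for a binary task, assuming the legacy glue_compute_metrics export from transformers:

import numpy as np
from transformers import glue_compute_metrics

# Hypothetical logits from two models on the same four examples (two classes).
logits_model_0 = np.array([[2.0, -1.0], [0.1, 0.3], [-0.5, 1.2], [1.0, 0.9]])
logits_model_1 = np.array([[1.5, -0.5], [0.4, 0.2], [-0.2, 0.8], [0.7, 1.1]])
labels = np.array([0, 1, 1, 0])

# Average the logits across models, then take the argmax as the prediction.
mean_logits = np.mean([logits_model_0, logits_model_1], axis=0)
preds = mean_logits.argmax(-1)

# Score exactly as the script does for GLUE tasks (SST-2 reports accuracy).
print(glue_compute_metrics("sst-2", preds, labels))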
Example #13
 def compute_metrics(p: EvalPrediction) -> Dict:
     return glue_compute_metrics(data_args.task_name, output_mode, p.predictions, p.label_ids)
Example #14
 def compute_metrics_fn(p: EvalPrediction):
     preds = np.argmax(p.predictions, axis=1)
     return glue_compute_metrics(task_name, preds, p.label_ids)
Example #15
def evaluate(args,
             model,
             task,
             tokenizer,
             accuracy_matrix,
             train_task_num,
             current_task_num,
             log_matrix,
             prefix=""):
    eval_dataset = load_examples(args, task, tokenizer, evaluate=True)

    if not os.path.exists(os.path.join(args.output_dir, prefix)):
        os.makedirs(os.path.join(args.output_dir, prefix))

    eval_sampler = RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info(
        "***** Running evaluation:: Task : {}, Prefix : {} *****".format(
            task, prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    indexes = None

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[1],
                "attention_mask": batch[2],
                "token_type_ids": batch[3],
                "labels": batch[4]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            indexes = batch[0].detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            indexes = np.append(indexes,
                                batch[0].detach().cpu().numpy(),
                                axis=0)
            out_label_ids = np.append(out_label_ids,
                                      inputs["labels"].detach().cpu().numpy(),
                                      axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if task == "ner" or task == "pos":
        results = defaultdict()
        preds = np.argmax(preds, axis=2)
        pad_token_label_id = CrossEntropyLoss().ignore_index

        tags_vals = task_processors[task]().get_labels()

        label_map = {i: label for i, label in enumerate(tags_vals)}

        index_list = []
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            index_list.append(indexes[i])
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        if task == "ner":
            results = {
                "loss":
                eval_loss,
                "acc":
                accuracy_score(out_label_list, preds_list),
                "precision":
                precision_score(out_label_list, preds_list),
                "recall":
                recall_score(out_label_list, preds_list),
                "f1":
                f1_score(out_label_list, preds_list),
                "classification report":
                classification_report(out_label_list, preds_list),
            }

            sequence_lens = []

            if train_task_num == 0:
                eval_result_file = os.path.join(
                    args.output_dir,
                    "eval_results_" + str(task) + "1" + ".txt")
            else:
                eval_result_file = os.path.join(
                    args.output_dir,
                    "eval_results_" + str(task) + "2" + ".txt")
            with open(eval_result_file, "w") as writer:
                writer.write("index\tlabel\tprediction\n")
                for index1, (item0, item1, item2) in enumerate(
                        zip(index_list, out_label_list, preds_list)):
                    for index2, (label, pred) in enumerate(zip(item1, item2)):
                        if label != pad_token_label_id:
                            if index2 == 0:
                                writer.write(f"{item0}\t{label}\t{pred}\n")
                            else:
                                writer.write(f"\t{label}\t{pred}\n")
                    sequence_lens.append(index2 + 1)
                    writer.write("\n")
                writer.write(str(sum(sequence_lens) / len(sequence_lens)))
            writer.close()

        elif task == "pos":
            results = {
                "loss": eval_loss,
                "acc": accuracy_score(out_label_list, preds_list),
                "precision": precision_score(out_label_list, preds_list),
                "recall": recall_score(out_label_list, preds_list),
                "f1": f1_score(out_label_list, preds_list),
            }

        result = results["acc"]

    else:
        preds = np.argmax(preds, axis=1)

        results = {}
        result = glue_compute_metrics(task, preds, out_label_ids)
        results.update(result)

        # Log evaluation result for the first task CoLA
        if task == "cola":
            # index_list = [] * out_label_ids.shape[0]
            # for i in range(out_label_ids.shape[0]):
            # 	index_list.append(indexes[i])
            tags_vals = task_processors[task]().get_labels()
            label_map = {}
            for i, label in enumerate(tags_vals):
                label_map[label] = i

            if train_task_num == 0:
                eval_result_file = os.path.join(
                    args.output_dir,
                    "eval_results_" + str(task) + "1" + ".txt")
            else:
                eval_result_file = os.path.join(
                    args.output_dir,
                    "eval_results_" + str(task) + "2" + ".txt")
            with open(eval_result_file, "w") as writer:
                writer.write("index\tlabel\tprediction\n")
                for index, (item0, item1, item2) in enumerate(
                        zip(indexes, out_label_ids, preds)):
                    item1 = label_map[str(item1)]
                    item2 = label_map[str(item2)]
                    writer.write(f"{item0}\t{item1}\t{item2}\n")
            writer.close()

        if task == 'cola':
            result = result['mcc']
        else:
            result = result['acc']

    logger.info("***** Eval results {} {}*****".format(prefix, task))
    for key in sorted(results.keys()):
        logger.info(" %s = %s", key, str(results[key]))

    if log_matrix:
        accuracy_matrix[train_task_num][current_task_num] = format(
            result, ".7f")

    return results, accuracy_matrix, result
Example #16
 def _compute_glue_metrics(self, task_name):
     return lambda p: glue_compute_metrics(
         task_name, np.argmax(p.predictions, axis=1), p.label_ids)
Example #17
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    tui_ids = None
    if model_args.umls:
        tui_ids = create_cui_dict(voc_updated=model_args.med_document, tokenizer=tokenizer)


    # Get datasets
    train_dataset = (
        GlueDataset(data_args,tokenizer=tokenizer, cache_dir=model_args.cache_dir, tui=tui_ids) if training_args.do_train else None
    )
    eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    test_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
        if training_args.do_predict
        else None
    )

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            pass
            # mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            # eval_datasets.append(
            #     GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
            # )

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    labels_id = []
    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            pass
            # mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            # test_datasets.append(
            #     GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
            # )

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)
            for i in range(0, len(test_dataset.features)):
                labels_id.append(test_dataset.features[i].label)

            metric = glue_compute_metrics(data_args.task_name, predictions, labels_id)
            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}_metric.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** test results {} *****".format(test_dataset.args.task_name))
                    for key, value in metric.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(metric)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))
    return eval_results
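As the comment at the top of main() notes, all three argument dataclasses can also be filled from a single JSON file passed as the script's only argument. A hypothetical minimal config, using field names from the stock ModelArguments / DataTrainingArguments / TrainingArguments dataclasses (any extra fields this variant adds, such as umls or med_document, would need to be included too):

import json

# Written to args.json and consumed via: python run_glue.py args.json
# (the script name is assumed here).
config = {
    "model_name_or_path": "bert-base-uncased",
    "task_name": "mrpc",
    "data_dir": "data/MRPC",
    "max_seq_length": 128,
    "output_dir": "output/mrpc",
    "do_train": True,
    "do_eval": True,
    "overwrite_output_dir": True,
}
with open("args.json", "w") as f:
    json.dump(config, f, indent=2)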
 def compute_metrics_fn(p: EvalPrediction):
     preds = np.argmax(p.predictions, axis=1)
     return glue_compute_metrics("classification", preds, p.label_ids)
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
        # model.zero_grad()
        losses.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        # global_step += 1

        # ------
        # Record
        # ------
        preds = logits.data.cpu().numpy()
        preds = np.argmax(preds, axis=1)
        out_label_ids = inputs["labels"].data.cpu().numpy()
        result = glue_compute_metrics(
            task_name, preds, out_label_ids)  # ['acc', 'f1', 'acc_and_f1']
        if recorder is not None:
            recorder.update(losses.item(),
                            acc=[result['acc_and_f1']],
                            batch_size=args.train_batch_size,
                            is_train=True)
            recorder.print_training_result(batch_idx=step,
                                           n_batch=len(train_dataloader))
        else:
            train_loss += losses.item()
            progress_bar(step, len(train_dataloader),
                         "Loss: %.3f" % (train_loss / (step + 1)))

    result = evaluate(task_name, model, eval_dataloader, model_type)
    print(result)
    if recorder is not None: