# Assumed imports for this snippet; `logger`, `compute_metrics`, and
# `load_and_cache_examples` are defined elsewhere in the surrounding script.
import json
import os
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
from pandas_ml import ConfusionMatrix  # assumed source of ConfusionMatrix
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    if args.stats_dir is None:
        args.stats_dir = args.output_dir

    # MNLI needs two output dirs so the zip() below covers both the matched and
    # mismatched passes; a single entry would silently drop "mnli-mm".
    eval_outputs_dirs = ((args.stats_dir, args.stats_dir + "-MM")
                         if args.task_name == "mnli" else (args.stats_dir, ))

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, label_list = load_and_cache_examples(args,
                                                           eval_task,
                                                           tokenizer,
                                                           evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2]
                        if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            # Per-example confidence is the max softmax probability over classes.
            preds_prob = np.max(F.softmax(torch.tensor(preds), dim=-1).numpy(),
                                axis=1)
            preds = np.argmax(preds, axis=1)
            results.update({
                "preds": [label_list[int(x)] for x in preds],
                "preds_prob": [float(x) for x in preds_prob],
            })
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
            results.update({"preds": [float(x) for x in preds]})

        # `passed_examples` is optional; when set, return the raw predictions
        # without computing metrics or writing reports.
        if getattr(args, "passed_examples", False):
            return results
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        eval_output_dir = Path(eval_output_dir)

        with open(eval_output_dir / "results.json", "w") as f:
            json.dump(results, f, indent=2)

        try:
            c_report = classification_report(out_label_ids,
                                             preds,
                                             target_names=label_list,
                                             labels=list(range(
                                                 len(label_list))))
            print(c_report)
            cm = ConfusionMatrix([label_list[x] for x in out_label_ids],
                                 [label_list[x] for x in preds])
            str_cm = str(cm)
            print(str_cm)

            str_stats = cm._str_stats()
            print(str_stats)
            cm.plot().get_figure().savefig(eval_output_dir / "output.png")

        except Exception:
            # The report/plot step is best-effort; fall back to empty strings.
            c_report, str_cm, str_stats = "", "", ""

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

            writer.write("\n" + c_report + "\n")
            writer.write("\n" + str_cm + "\n")
            writer.write("\n" + str_stats + "\n")

    return results
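
The "preds_prob" values above are the per-example confidence scores: softmax over the raw logits, then the maximum probability per row. A minimal, self-contained sketch of that pattern, using made-up logits and hypothetical label names rather than the model output from the loop above:

import numpy as np
import torch
import torch.nn.functional as F

# Fake logits for three examples over two classes, standing in for model output.
logits = np.array([[2.0, 0.5], [0.1, 1.9], [0.3, 0.2]])

probs = F.softmax(torch.tensor(logits), dim=-1).numpy()
pred_ids = np.argmax(logits, axis=1)   # predicted class index per example
pred_conf = np.max(probs, axis=1)      # softmax probability of the predicted class

label_list = ["negative", "positive"]  # hypothetical label names
for idx, conf in zip(pred_ids, pred_conf):
    print(label_list[int(idx)], round(float(conf), 3))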
Example #2

# Assumed imports for this snippet; `predictionsFN`, `jsonCall`,
# `score_label_name`, and `evalFN` are defined earlier in the original script.
import sys
from os import path

import numpy as np
import pandas as pd
import sklearn.metrics
from pandas_ml import ConfusionMatrix  # assumed source of ConfusionMatrix
outputData = pd.read_csv(predictionsFN, header=0)

targetsFilePath = path.join(jsonCall['test_data'], '..', 'SCORE', 'targets.csv')
targetsData = pd.read_csv(targetsFilePath, header=0).fillna('')

if len(outputData) != len(targetsData):
    raise Exception('Number of outputs does not match number of targets, %d vs %d' % (len(outputData), len(targetsData)))

outputC = [ outputData[score_label_name][i] for i in range(len(targetsData)) if targetsData[score_label_name][i] != '' ]
targetsC = [ x for x in targetsData[score_label_name] if x != '' ]
lbls = sorted(set(outputC+targetsC))

#acc = np.mean(targetsC == outputC)
acc = len([ i for i in range(len(outputC)) if outputC[i] == targetsC[i]]) / len(outputC)
confmat = sklearn.metrics.confusion_matrix(targetsC, outputC, labels=lbls)
confmat_norm = confmat.astype('float') / confmat.sum(axis=1)[:, np.newaxis]
np.set_printoptions(precision=3)
np.set_printoptions(threshold=sys.maxsize)  # np.nan is rejected by modern NumPy
pd_confmat = ConfusionMatrix(targetsC, outputC)
with open(evalFN, 'w') as evalF:
    evalF.write('Accuracy: %f\n' % (acc))
    #evalF.write('Classes: \n%s\n\n' % (" ".join(lbls)))
    evalF.write('Confusion matrix normalized:\n%s\n\n' % (confmat_norm))
    evalF.write('Confusion matrix: \n%s\n\n' % (pd_confmat))
    evalF.write('%s' % (pd_confmat._str_stats()))

with open(evalFN, 'r') as evalF:
    for line in evalF:
        print(line, end='')  # lines already carry a trailing newline
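
For reference, the same accuracy and row-normalized confusion matrix can be computed on toy label lists without the CSV files above; the class names and values here are made up:

import numpy as np
import sklearn.metrics

# Hypothetical gold and predicted labels standing in for targetsC / outputC.
targets = ["cat", "dog", "dog", "cat", "bird"]
outputs = ["cat", "dog", "cat", "cat", "bird"]
labels = sorted(set(targets + outputs))

acc = np.mean([t == o for t, o in zip(targets, outputs)])
confmat = sklearn.metrics.confusion_matrix(targets, outputs, labels=labels)

# Normalize each row by its true-class count; guard against classes that never
# appear as gold labels so the division cannot produce NaN.
confmat_norm = confmat.astype(float)
row_sums = confmat_norm.sum(axis=1, keepdims=True)
row_sums[row_sums == 0] = 1.0
confmat_norm = confmat_norm / row_sums

print('Accuracy: %.3f' % acc)
print(labels)
print(confmat_norm)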