import json
import os
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

# `load_and_cache_examples`, `compute_metrics`, `logger`, and `ConfusionMatrix`
# (a pandas_ml-style confusion matrix exposing plot() and _str_stats()) are
# provided by the surrounding project.


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = (("mnli", "mnli-mm") if args.task_name == "mnli"
                       else (args.task_name,))
    # eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
    if args.stats_dir is None:
        args.stats_dir = args.output_dir
    # Mirror the "-MM" convention above; with a single dir for MNLI, zip()
    # below would silently skip the mismatched eval set.
    eval_outputs_dirs = ((args.stats_dir, args.stats_dir + "-MM")
                         if args.task_name == "mnli" else (args.stats_dir,))

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, label_list = load_and_cache_examples(
            args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = (SequentialSampler(eval_dataset) if args.local_rank == -1
                        else DistributedSampler(eval_dataset))
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            # Max softmax probability per example, kept as a confidence score.
            preds_prob = np.max(
                F.softmax(torch.tensor(preds), dim=-1).numpy(), axis=1)
            preds = np.argmax(preds, axis=1)
            results.update({
                "preds": [label_list[int(x)] for x in preds],
                "preds_prob": [float(x) for x in preds_prob],
            })
        elif args.output_mode == "regression":
            # Regression has no label names or probabilities; report raw scores
            # (the original code would have crashed on undefined preds_prob here).
            preds = np.squeeze(preds)
            results.update({"preds": [float(x) for x in preds]})

        # When the caller only wants predictions (e.g. on unlabeled examples),
        # skip metric computation.
        if getattr(args, "passed_examples", False):
            return results

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        eval_output_dir = Path(eval_output_dir)
        with open(eval_output_dir / "results.json", "w") as f:
            json.dump(results, f, indent=2)

        try:
            c_report = classification_report(
                out_label_ids, preds, target_names=label_list,
                labels=list(range(len(label_list))))
            print(c_report)
            cm = ConfusionMatrix([label_list[x] for x in out_label_ids],
                                 [label_list[x] for x in preds])
            str_cm = str(cm)
            print(str_cm)
            str_stats = cm._str_stats()
            print(str_stats)
            cm.plot().get_figure().savefig(eval_output_dir / "output.png")
        except Exception:
            # Reporting/plotting is best-effort; fall back to empty strings.
            c_report, str_cm, str_stats = "", "", ""

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("\n" + c_report + "\n")
            writer.write("\n" + str_cm + "\n")
            writer.write("\n" + str_stats + "\n")

    return results
import sys
from os import path

import numpy as np
import pandas as pd
import sklearn.metrics

# `ConfusionMatrix` (pandas_ml-style), `jsonCall`, `predictionsFN`, `evalFN`,
# and `score_label_name` are defined earlier in the surrounding script.

outputData = pd.read_csv(predictionsFN, header=0)
targetsFilePath = path.join(jsonCall['test_data'], '..', 'SCORE', 'targets.csv')
targetsData = pd.read_csv(targetsFilePath, header=0).fillna('')

if len(outputData) != len(targetsData):
    raise Exception('Number of outputs does not match number of targets, %d vs %d'
                    % (len(outputData), len(targetsData)))

# Keep only the rows whose target label is non-empty, in both columns.
outputC = [outputData[score_label_name][i]
           for i in range(len(targetsData))
           if targetsData[score_label_name][i] != '']
targetsC = [x for x in targetsData[score_label_name] if x != '']

lbls = sorted(set(outputC + targetsC))
# acc = np.mean(targetsC == outputC)
acc = len([i for i in range(len(outputC)) if outputC[i] == targetsC[i]]) / len(outputC)

confmat = sklearn.metrics.confusion_matrix(targetsC, outputC, labels=lbls)
# Row-normalize so each row sums to 1 (per-class recall on the diagonal).
confmat_norm = confmat.astype('float') / confmat.sum(axis=1)[:, np.newaxis]

np.set_printoptions(precision=3)
np.set_printoptions(threshold=sys.maxsize)  # np.nan is rejected by newer NumPy

pd_confmat = ConfusionMatrix(targetsC, outputC)
with open(evalFN, 'w') as evalF:
    evalF.write('Accuracy: %f\n' % acc)
    # evalF.write('Classes: \n%s\n\n' % (" ".join(lbls)))
    evalF.write('Confusion matrix normalized:\n%s\n\n' % (confmat_norm,))
    evalF.write('Confusion matrix: \n%s\n\n' % (pd_confmat,))
    evalF.write('%s' % pd_confmat._str_stats())

with open(evalFN, 'r') as evalF:
    for line in evalF:
        print(line, end='')  # lines already carry their own newlines
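
# --- Reference example (not part of the original script) ---
# The row-normalization above turns raw counts into per-class recall: each
# row of `confmat` is divided by its row total, so the diagonal of
# `confmat_norm` holds the recall of each class. A toy illustration:
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = ['cat', 'cat', 'cat', 'dog', 'dog']
y_pred = ['cat', 'cat', 'dog', 'dog', 'dog']
cm = confusion_matrix(y_true, y_pred, labels=['cat', 'dog'])
# cm == [[2, 1],
#        [0, 2]]        rows = true class, columns = predicted class
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# cm_norm == [[0.667, 0.333],   recall('cat') = 2/3
#             [0.000, 1.000]]   recall('dog') = 2/2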