def compute_metrics(metric, preds, labels): """ Calculate the named metric values for the list of predictions vs list of labels. :param metric: The name of a predefined metric; a function that takes a prediction list and a label list and returns a dict from metric names to values, or recursively a list of metrics. Predefined metrics are: mcc, acc, acc_f1, pear_spear, seq_f1, f1_macro, squad, mse, r2, top_n_accuracy, text_similarity_metric. :type metric: Samples are truncated after this many tokens. :param preds: list of predictions :param labels: list of target labels :return: a dictionary mapping metric names to values. """ assert len(preds) == len(labels) if metric == "mcc": return {"mcc": matthews_corrcoef(labels, preds)} elif metric == "acc": return simple_accuracy(preds, labels) elif metric == "acc_f1": return acc_and_f1(preds, labels) elif metric == "pear_spear": return pearson_and_spearman(preds, labels) # TODO this metric seems very specific for NER and doesnt work for elif metric == "seq_f1": return {"seq_f1": ner_f1_score(labels, preds)} elif metric == "f1_macro": return f1_macro(preds, labels) elif metric == "squad": return squad(preds, labels) elif metric == "mse": return {"mse": mean_squared_error(preds, labels)} elif metric == "r2": return {"r2": r2_score(preds, labels)} elif metric == "top_n_accuracy": return {"top_n_accuracy": top_n_accuracy(preds, labels)} elif metric == "text_similarity_metric": return text_similarity_metric(preds, labels) # elif metric == "masked_accuracy": # return simple_accuracy(preds, labels, ignore=-1) elif isinstance(metric, list): ret = {} for m in metric: ret.update(compute_metrics(m, preds, labels)) return ret elif metric in registered_metrics: metric_func = registered_metrics[metric] return metric_func(preds, labels) else: raise KeyError(metric)
import mxnet as mx
import numpy as np
# Span-level F1 over the slot tag sequences; assumed to come from seqeval.
from seqeval.metrics import f1_score as ner_f1_score


def evaluation(ctx, data_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab):
    """Evaluate a joint intent-classification / slot-labeling model.

    Parameters
    ----------
    ctx : Context
    data_loader : DataLoader
    net : Block
    intent_pred_loss : Loss
    slot_pred_loss : Loss
    slot_vocab : Vocab

    Returns
    -------
    avg_intent_loss : float
    avg_slot_loss : float
    intent_acc : float
    slot_f1 : float
    pred_slots : list
    gt_slots : list
    """
    nsample = 0
    nslot = 0
    avg_intent_loss = 0
    avg_slot_loss = 0
    correct_intent = 0
    pred_slots = []
    gt_slots = []
    for token_ids, mask, selected, slot_ids, intent_label, valid_length in data_loader:
        token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
        mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
        slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
        intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
        valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
        batch_nslot = mask.sum().asscalar()
        batch_nsample = token_ids.shape[0]
        # Forward network
        intent_scores, slot_scores = net(token_ids, valid_length)
        intent_loss = intent_pred_loss(intent_scores, intent_label)
        slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
        # Accumulate summed losses; they are averaged over the dataset below.
        avg_intent_loss += intent_loss.sum().asscalar()
        avg_slot_loss += slot_loss.sum().asscalar()
        pred_slot_ids = mx.nd.argmax(slot_scores, axis=-1).astype(np.int32)
        correct_intent += (mx.nd.argmax(intent_scores, axis=-1).astype(np.int32)
                           == intent_label).sum().asscalar()
        for i in range(batch_nsample):
            # Keep only the selected positions within the valid length and map
            # slot ids back to tag strings for span-level F1.
            ele_valid_length = int(valid_length[i].asscalar())
            ele_sel = selected[i].asnumpy()[:ele_valid_length]
            ele_gt_slot_ids = slot_ids[i].asnumpy()[ele_sel]
            ele_pred_slot_ids = pred_slot_ids[i].asnumpy()[ele_sel]
            gt_slots.append([slot_vocab.idx_to_token[v] for v in ele_gt_slot_ids])
            pred_slots.append([slot_vocab.idx_to_token[v] for v in ele_pred_slot_ids])
        nsample += batch_nsample
        nslot += batch_nslot
    avg_intent_loss /= nsample
    avg_slot_loss /= nslot
    intent_acc = correct_intent / float(nsample)
    slot_f1 = ner_f1_score(pred_slots, gt_slots)
    return avg_intent_loss, avg_slot_loss, intent_acc, slot_f1, pred_slots, gt_slots
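# --- Illustration (not part of the original script) --------------------------
# evaluation() returns `pred_slots` / `gt_slots` as lists of per-utterance tag
# sequences (IOB strings from slot_vocab), and `slot_f1` scores them at the
# entity-span level. A tiny hand-made example with invented tag names follows.
# Note seqeval's signature is f1_score(y_true, y_pred); micro-averaged F1 is
# unchanged if the two arguments are swapped, since precision and recall just
# trade places.

from seqeval.metrics import f1_score

gt = [["O", "B-city", "I-city", "O"], ["B-date", "O"]]
pred = [["O", "B-city", "I-city", "O"], ["O", "O"]]
# 2 gold spans, 1 predicted span, 1 exact match:
# precision = 1/1, recall = 1/2, F1 = 2 * 1.0 * 0.5 / 1.5 ~ 0.667
print(f1_score(gt, pred))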