Example #1
def compute_metrics(metric, preds, labels):
    if metric not in ["squad", "squad_top_recall"]:
        assert len(preds) == len(labels)
    if metric == "mcc":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif metric == "acc":
        return simple_accuracy(preds, labels)
    elif metric == "acc_f1":
        return acc_and_f1(preds, labels)
    elif metric == "pear_spear":
        return pearson_and_spearman(preds, labels)
    # TODO this metric seems very specific to NER and doesn't work for other sequence labeling tasks
    elif metric == "seq_f1":
        return {"seq_f1": ner_f1_score(labels, preds)}
    elif metric == "f1_macro":
        return f1_macro(preds, labels)
    elif metric == "squad":
        return squad(preds, labels)
    elif metric == "squad_top_recall":
        return squad_N_recall(preds=preds, labels=labels)
    elif metric == "mse":
        return {"mse": mean_squared_error(preds, labels)}
    elif metric == "r2":
        return {"r2": r2_score(preds, labels)}
    # elif metric == "masked_accuracy":
    #     return simple_accuracy(preds, labels, ignore=-1)
    else:
        raise KeyError(metric)
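
A minimal usage sketch of the dispatcher above; the toy prediction/label lists are illustrative, and the metric helpers (matthews_corrcoef, mean_squared_error, r2_score) are assumed to come from sklearn.metrics as is conventional.

# Toy binary classification outputs (illustrative only)
preds = [0, 1, 1, 0, 1]
labels = [0, 1, 0, 0, 1]

compute_metrics("mcc", preds, labels)  # -> {'mcc': ...}
compute_metrics("mse", preds, labels)  # -> {'mse': 0.2}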
Example #2
def compute_metrics(metric, preds, labels):
    assert len(preds) == len(labels)
    if metric == "mcc":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif metric == "acc":
        return simple_accuracy(preds, labels)
    elif metric == "acc_f1":
        return acc_and_f1(preds, labels)
    elif metric == "pear_spear":
        return pearson_and_spearman(preds, labels)
    # TODO this metric seems very specific to NER and doesn't work for other sequence labeling tasks
    elif metric == "seq_f1":
        return {"seq_f1": ner_f1_score(labels, preds)}
    elif metric == "f1_macro":
        return f1_macro(preds, labels)
    elif metric == "squad":
        return squad(preds, labels)
    elif metric == "mse":
        return {"mse": mean_squared_error(preds, labels)}
    elif metric == "r2":
        return {"r2": r2_score(preds, labels)}
    elif metric == "top_n_accuracy":
        return {"top_n_accuracy": top_n_accuracy(preds, labels)}
    # elif metric == "masked_accuracy":
    #     return simple_accuracy(preds, labels, ignore=-1)
    elif metric in registered_metrics:
        metric_func = registered_metrics[metric]
        return metric_func(preds, labels)
    else:
        raise KeyError(metric)
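
registered_metrics is referenced in the final elif branch but not defined in this excerpt. A minimal sketch of the registration pattern that branch implies, assuming registered_metrics is a module-level dict; the register_metrics helper and the weighted_f1 metric are illustrative assumptions, not part of the original example.

from sklearn.metrics import f1_score

registered_metrics = {}

def register_metrics(name, implementation):
    # Register a custom metric function under a name compute_metrics can dispatch on
    registered_metrics[name] = implementation

def weighted_f1(preds, labels):
    # Custom metrics should return a dict mapping metric names to values
    return {"f1_weighted": f1_score(labels, preds, average="weighted")}

register_metrics("f1_weighted", weighted_f1)
# compute_metrics("f1_weighted", preds, labels) now falls through to the registry branch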
Example #3
def compute_metrics(metric, preds, labels):
    """
    Calculate the named metric(s) for a list of predictions against a list of target labels.

    :param metric: The name of a predefined metric; a function that takes a list of predictions and a
        list of labels and returns a dict mapping metric names to values; or (recursively) a list of
        such metrics. Predefined metrics are: mcc, acc, acc_f1, pear_spear, seq_f1, f1_macro, squad,
        mse, r2, top_n_accuracy, text_similarity_metric.
    :type metric: str, function, or list
    :param preds: list of predictions
    :param labels: list of target labels
    :return: a dictionary mapping metric names to values.
    """
    assert len(preds) == len(labels)
    if metric == "mcc":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif metric == "acc":
        return simple_accuracy(preds, labels)
    elif metric == "acc_f1":
        return acc_and_f1(preds, labels)
    elif metric == "pear_spear":
        return pearson_and_spearman(preds, labels)
    # TODO this metric seems very specific to NER and doesn't work for other sequence labeling tasks
    elif metric == "seq_f1":
        return {"seq_f1": ner_f1_score(labels, preds)}
    elif metric == "f1_macro":
        return f1_macro(preds, labels)
    elif metric == "squad":
        return squad(preds, labels)
    elif metric == "mse":
        return {"mse": mean_squared_error(preds, labels)}
    elif metric == "r2":
        return {"r2": r2_score(preds, labels)}
    elif metric == "top_n_accuracy":
        return {"top_n_accuracy": top_n_accuracy(preds, labels)}
    elif metric == "text_similarity_metric":
        return text_similarity_metric(preds, labels)
    # elif metric == "masked_accuracy":
    #     return simple_accuracy(preds, labels, ignore=-1)
    elif isinstance(metric, list):
        ret = {}
        for m in metric:
            ret.update(compute_metrics(m, preds, labels))
        return ret
    elif metric in registered_metrics:
        metric_func = registered_metrics[metric]
        return metric_func(preds, labels)
    else:
        raise KeyError(metric)
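
A short sketch of the list dispatch above: each entry in the list is computed recursively and the resulting dicts are merged into one. The metric names and toy inputs are illustrative, and this assumes the helpers behind them (e.g. simple_accuracy, f1_macro) return dicts, as the ret.update() merging requires.

# Toy multi-class outputs (illustrative only)
preds = [0, 2, 1, 1]
labels = [0, 2, 2, 1]

# Merges the per-metric dicts, e.g. {'acc': ..., 'mcc': ..., 'f1_macro': ...}
combined = compute_metrics(["acc", "mcc", "f1_macro"], preds, labels)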
Example #4
def evaluation(ctx, data_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab):
    """

    Parameters
    ----------
    ctx : Context
    data_loader : DataLoader
    net : Block
    intent_pred_loss : Loss
    slot_pred_loss : Loss
    slot_vocab : Vocab

    Returns
    -------
    avg_intent_loss : float
    avg_slot_loss : float
    intent_acc : float
    slot_f1 : float
    pred_slots : list
    gt_slots : list
    """
    nsample = 0
    nslot = 0
    avg_intent_loss = 0
    avg_slot_loss = 0
    correct_intent = 0
    pred_slots = []
    gt_slots = []
    for token_ids, mask, selected, slot_ids, intent_label, valid_length in data_loader:
        # Move the batch to the evaluation context and cast to the dtypes the network expects
        token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
        mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
        slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
        intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
        valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
        batch_nslot = mask.sum().asscalar()
        batch_nsample = token_ids.shape[0]
        # Forward network
        intent_scores, slot_scores = net(token_ids, valid_length)
        intent_loss = intent_pred_loss(intent_scores, intent_label)
        slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
        avg_intent_loss += intent_loss.sum().asscalar()
        avg_slot_loss += slot_loss.sum().asscalar()
        pred_slot_ids = mx.nd.argmax(slot_scores, axis=-1).astype(np.int32)
        correct_intent += (mx.nd.argmax(intent_scores, axis=-1).astype(np.int32)
                           == intent_label).sum().asscalar()
        # Convert gold and predicted slot ids at the selected positions back to tag strings
        # so that a sequence-labeling F1 can be computed over them
        for i in range(batch_nsample):
            ele_valid_length = int(valid_length[i].asscalar())
            ele_sel = selected[i].asnumpy()[:ele_valid_length]
            ele_gt_slot_ids = slot_ids[i].asnumpy()[ele_sel]
            ele_pred_slot_ids = pred_slot_ids[i].asnumpy()[ele_sel]
            ele_gt_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_gt_slot_ids]
            ele_pred_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_pred_slot_ids]
            gt_slots.append(ele_gt_slot_tokens)
            pred_slots.append(ele_pred_slot_tokens)
        nsample += batch_nsample
        nslot += batch_nslot
    avg_intent_loss /= nsample
    avg_slot_loss /= nslot
    intent_acc = correct_intent / float(nsample)
    slot_f1 = ner_f1_score(pred_slots, gt_slots)
    return avg_intent_loss, avg_slot_loss, intent_acc, slot_f1, pred_slots, gt_slots
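
For reference, the slot_f1 returned above is a sequence-labeling F1 computed over the collected tag sequences. A minimal sketch of that final step in isolation, assuming ner_f1_score is seqeval's f1_score (the import is not shown in the excerpt, so this is an assumption); the tag sequences below are illustrative.

from seqeval.metrics import f1_score as ner_f1_score

# Illustrative tag sequences in the same shape as pred_slots / gt_slots
pred_slots = [["O", "B-time", "I-time", "O"], ["B-loc", "O"]]
gt_slots = [["O", "B-time", "I-time", "O"], ["B-loc", "O"]]

print(ner_f1_score(pred_slots, gt_slots))  # 1.0 when predictions match the ground truth exactly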