from typing import Dict, FrozenSet, List, Optional, Set, Union


def get_rollup_fscore_on_pii(
    scores: List[TextScore], fbeta: float, recall_threshold: Optional[float]
) -> float:
    """Calculate f score on PII recognition.

    A single score, f score, will be calculate to indicate how a system did on
    predicting PII entities. Recall thresholding is supported, if the system can
    recognise a certain portion of an entity greater than the threshold, that
    entity then will be considered identified.

    Args:
        scores: a list of text scores providing info including precisions and recalls.
        fbeta: beta value for f score.
        recall_threshold: a float between 0 and 1. Any recall value that is greater
            than or equals to the threshold would be rounded up to 1.

    Returns:
        A f score represents performance of a system.
    """
    fscores = []
    for text_score in scores:
        precisions = [p.precision for p in text_score.precisions]
        recalls = [r.recall for r in text_score.recalls]
        f = compute_pii_detection_fscore(precisions, recalls, recall_threshold, fbeta)
        fscores.append(f)

    if fscores:
        return round(sum(fscores) / len(fscores), 4)
    else:
        # fscores can only be empty when the "scores" argument is empty; in
        # that case the F score is defined as 0.
        return 0.0
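
get_rollup_fscore_on_pii defers the per-text calculation to compute_pii_detection_fscore, which is not shown in this snippet. A minimal sketch that is consistent with the docstring above and with the tests further down (average the precisions and the thresholded recalls, then apply the F-beta formula) might look like the following; it is inferred from those tests, not taken from the project's implementation:

from typing import List, Optional


def compute_pii_detection_fscore(
    precisions: List[float],
    recalls: List[float],
    recall_threshold: Optional[float] = None,
    beta: float = 1.0,
) -> float:
    # Sketch inferred from the docstring and tests; not the actual implementation.
    if recall_threshold is not None and not 0 <= recall_threshold <= 1:
        raise ValueError(
            "Invalid threshold! Recall threshold must between 0 and 1 "
            f"but got {recall_threshold}"
        )
    if not precisions and not recalls:
        # No PII present and none predicted counts as a perfect score.
        return 1.0
    if not precisions or not recalls:
        return 0.0
    if recall_threshold is not None:
        # Round any recall that reaches the threshold up to 1.
        recalls = [1.0 if r >= recall_threshold else r for r in recalls]
    precision = sum(precisions) / len(precisions)
    recall = sum(recalls) / len(recalls)
    if precision + recall == 0.0:
        return 0.0
    return (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)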


def get_rollup_metrics_on_types(
    grouped_labels: List[Set[str]], scores: List[TextScore], fbeta: float,
) -> Dict[FrozenSet[str], Dict[str, Union[float, str]]]:
    """Calculate f1, average precision and average recall for every group in the
    grouped labels.
    """
    score_table = regroup_scores_on_types(grouped_labels, scores)

    metrics = dict()
    for key, value in score_table.items():
        f1 = round(
            compute_pii_detection_fscore(
                value["precisions"], value["recalls"], beta=fbeta
            ),
            4,
        )

        if value["precisions"]:
            ave_precision = round(
                sum(value["precisions"]) / len(value["precisions"]), 4
            )
        else:
            ave_precision = "undefined"

        if value["recalls"]:
            ave_recall = round(sum(value["recalls"]) / len(value["recalls"]), 4)
        else:
            ave_recall = "undefined"

        metrics[key] = {
            "f1": f1,
            "ave-precision": ave_precision,
            "ave-recall": ave_recall,
        }
    return metrics
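
get_rollup_metrics_on_types relies on regroup_scores_on_types, which is also not shown here, to bucket per-entity precision and recall values by label group. A rough sketch is given below; it assumes that each entry in TextScore.precisions and TextScore.recalls carries the entity label it was scored against (the entity_type attribute is a hypothetical name, not a documented field):

from typing import Dict, FrozenSet, List, Set


def regroup_scores_on_types(
    grouped_labels: List[Set[str]], scores: List["TextScore"]
) -> Dict[FrozenSet[str], Dict[str, List[float]]]:
    # Sketch only; entity_type is an assumed field on the per-entity
    # precision/recall entries.
    score_table: Dict[FrozenSet[str], Dict[str, List[float]]] = {
        frozenset(group): {"precisions": [], "recalls": []} for group in grouped_labels
    }
    for text_score in scores:
        for p in text_score.precisions:
            for group in grouped_labels:
                if p.entity_type in group:
                    score_table[frozenset(group)]["precisions"].append(p.precision)
        for r in text_score.recalls:
            for group in grouped_labels:
                if r.entity_type in group:
                    score_table[frozenset(group)]["recalls"].append(r.recall)
    return score_table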

Example 3
def test_compute_pii_detection_fscore_with_recall_threshold():
    precisions = [0.4, 0.8, 0.9]
    recalls = [0.2, 0.51, 0.7]
    actual = compute_pii_detection_fscore(precisions,
                                          recalls,
                                          recall_threshold=0.5)
    assert_almost_equal(actual, 0.716279)
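
For reference, the expected value above can be reproduced by hand under the averaging behaviour sketched earlier: recalls at or above the 0.5 threshold are rounded up to 1 before averaging.

precisions = [0.4, 0.8, 0.9]
recalls = [1.0 if r >= 0.5 else r for r in [0.2, 0.51, 0.7]]  # -> [0.2, 1.0, 1.0]
p, r = sum(precisions) / 3, sum(recalls) / 3                  # 0.7 and ~0.7333
print(round(2 * p * r / (p + r), 6))                          # 0.716279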

Example 4
def test_compute_pii_detection_fscore_for_empty_precisions_recalls():
    actual = compute_pii_detection_fscore([], [])
    assert actual == 1.0

Example 5
def test_compute_pii_detection_fscore_for_empty_recalls():
    actual = compute_pii_detection_fscore([0.0], [])
    assert actual == 0.0

Example 6
def test_compute_pii_detection_fscore_for_empty_precisions():
    actual = compute_pii_detection_fscore([], [0.0])
    assert actual == 0.0

Example 7
def test_compute_pii_detection_fscore_for_invalid_threshold():
    precisions = recalls = [0.0]
    with pytest.raises(ValueError) as err:
        compute_pii_detection_fscore(precisions, recalls, recall_threshold=2.0)
    assert str(err.value) == (
        "Invalid threshold! Recall threshold must between 0 and 1 but got 2.0")

Example 8
def test_compute_pii_detection_fscore_for_no_recall_threshold():
    precisions = [0.4, 0.8]
    recalls = [0.2, 0.7]
    actual = compute_pii_detection_fscore(precisions, recalls)
    assert_almost_equal(actual, 0.5142857)

Example 9
def test_compute_pii_detection_fscore_for_no_recall_threshold_fscore_is_zero():
    precisions = [0.0, 0.0]
    recalls = [0.0, 0.0]
    actual = compute_pii_detection_fscore(precisions, recalls)
    assert actual == 0.0