Example #1
# NOTE: assumes EvaluationResult and the MockTokensModel test helper from the
# presidio-evaluator test suite are already in scope.
def test_confusion_matrix_correct_metrics():
    from collections import Counter

    # Each Counter key is an (annotated, predicted) tag pair; its value is the
    # number of tokens that fell into that confusion-matrix cell.
    evaluated = [
        EvaluationResult(results=Counter({
            ('O', 'O'): 150,
            ('O', 'PERSON'): 30,
            ('O', 'COMPANY'): 30,
            ('PERSON', 'PERSON'): 40,
            ('COMPANY', 'COMPANY'): 40,
            ('PERSON', 'COMPANY'): 10,
            ('COMPANY', 'PERSON'): 10,
            ('PERSON', 'O'): 30,
            ('COMPANY', 'O'): 30
        }),
                         model_errors=None,
                         text=None)
    ]

    # prediction=None: the mock is not asked to tag anything here; it only
    # scores the precomputed confusion matrix for the two kept entities.
    model = MockTokensModel(prediction=None,
                            entities_to_keep=['PERSON', 'COMPANY'])

    scores = model.calculate_score(evaluated, beta=2.5)

    assert scores.pii_precision == 0.625
    assert scores.pii_recall == 0.625
    assert scores.entity_recall_dict['PERSON'] == 0.5
    assert scores.entity_precision_dict['PERSON'] == 0.5
    assert scores.entity_recall_dict['COMPANY'] == 0.5
    assert scores.entity_precision_dict['COMPANY'] == 0.5
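
The expected values can be reproduced from the counts alone. A minimal, self-contained sketch of that arithmetic (plain Python over the same Counter, no presidio-evaluator code; it assumes cross-type cells such as ('PERSON', 'COMPANY') count as PII true positives, matching how pii_tp is built in Example #2):

from collections import Counter

results = Counter({
    ('O', 'O'): 150, ('O', 'PERSON'): 30, ('O', 'COMPANY'): 30,
    ('PERSON', 'PERSON'): 40, ('COMPANY', 'COMPANY'): 40,
    ('PERSON', 'COMPANY'): 10, ('COMPANY', 'PERSON'): 10,
    ('PERSON', 'O'): 30, ('COMPANY', 'O'): 30})

# PII level: any annotated entity predicted as any entity is a true positive.
pii_tp = sum(v for (gold, pred), v in results.items()
             if gold != 'O' and pred != 'O')                       # 100
pii_fp = sum(v for (gold, pred), v in results.items()
             if gold == 'O' and pred != 'O')                       # 60
pii_fn = sum(v for (gold, pred), v in results.items()
             if gold != 'O' and pred == 'O')                       # 60

assert pii_tp / (pii_tp + pii_fp) == 0.625   # pii_precision
assert pii_tp / (pii_tp + pii_fn) == 0.625   # pii_recall

# Entity level, e.g. PERSON recall = correct PERSON / all annotated PERSON.
person_tp = results[('PERSON', 'PERSON')]                          # 40
person_annotated = sum(v for (gold, _), v in results.items()
                       if gold == 'PERSON')                        # 80
assert person_tp / person_annotated == 0.5   # entity_recall_dict['PERSON']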
Example #2
def test_confusion_matrix_2_correct_metrics():
    from collections import Counter

    evaluated = [
        EvaluationResult(results=Counter({
            ('O', 'O'): 65467,
            ('O', 'ORG'): 4189,
            ('GPE', 'O'): 3370,
            ('PERSON', 'PERSON'): 2024,
            ('GPE', 'PERSON'): 1488,
            ('GPE', 'GPE'): 1033,
            ('O', 'GPE'): 964,
            ('ORG', 'ORG'): 914,
            ('O', 'PERSON'): 834,
            ('GPE', 'ORG'): 401,
            ('PERSON', 'ORG'): 35,
            ('PERSON', 'O'): 33,
            ('ORG', 'O'): 8,
            ('PERSON', 'GPE'): 5,
            ('ORG', 'PERSON'): 1
        }),
                         model_errors=None,
                         text=None)
    ]

    model = MockTokensModel(prediction=None)

    scores = model.calculate_score(evaluated, beta=2.5)

    # PII true positives: any annotated entity predicted as any entity,
    # including cross-type confusions. ('ORG', 'GPE') is absent from the
    # Counter above, so that term contributes 0.
    pii_tp = evaluated[0].results[('PERSON', 'PERSON')] + \
             evaluated[0].results[('ORG', 'ORG')] + \
             evaluated[0].results[('GPE', 'GPE')] + \
             evaluated[0].results[('ORG', 'GPE')] + \
             evaluated[0].results[('ORG', 'PERSON')] + \
             evaluated[0].results[('GPE', 'ORG')] + \
             evaluated[0].results[('GPE', 'PERSON')] + \
             evaluated[0].results[('PERSON', 'GPE')] + \
             evaluated[0].results[('PERSON', 'ORG')]

    # PII false positives: non-entity ('O') tokens predicted as an entity.
    pii_fp = evaluated[0].results[('O', 'PERSON')] + \
             evaluated[0].results[('O', 'GPE')] + \
             evaluated[0].results[('O', 'ORG')]

    # PII false negatives: annotated entities predicted as 'O'.
    pii_fn = evaluated[0].results[('PERSON', 'O')] + \
             evaluated[0].results[('GPE', 'O')] + \
             evaluated[0].results[('ORG', 'O')]

    assert scores.pii_precision == pii_tp / (pii_tp + pii_fp)
    assert scores.pii_recall == pii_tp / (pii_tp + pii_fn)
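
For reference, plugging in the counts (with the absent ('ORG', 'GPE') cell contributing 0) gives the concrete values behind those assertions:

pii_tp = 2024 + 914 + 1033 + 0 + 1 + 401 + 1488 + 5 + 35   # 5901
pii_fp = 834 + 964 + 4189                                   # 5987
pii_fn = 33 + 3370 + 8                                      # 3411

print(pii_tp / (pii_tp + pii_fp))   # ~0.4964  (pii_precision)
print(pii_tp / (pii_tp + pii_fn))   # ~0.6337  (pii_recall)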
Example #3
def test_evaluate_multiple_examples_ignore_entity_correct_statistics():
    # 'David' is predicted as TENNIS_PLAYER although it is annotated as PERSON;
    # since both entity types are kept, the PII-level scores are unaffected.
    prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"]
    model = MockTokensModel(prediction=prediction,
                            labeling_scheme='BILOU',
                            entities_to_keep=['PERSON', 'TENNIS_PLAYER'])
    input_sample = InputSample("My name is Raphael or David",
                               masked=None,
                               spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    # Evaluate four copies of the same sample and aggregate the results.
    evaluated = model.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample])
    scores = model.calculate_score(evaluated)
    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
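
What makes both scores equal 1 is that the PII-level metric ignores the entity type: the last token is annotated PERSON but predicted TENNIS_PLAYER, and because both types are kept it still counts as a true positive (consistent with how pii_tp is built in Example #2). A minimal, self-contained sketch of that token-level view, with no presidio-evaluator code:

annotation = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]
prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"]

# PII-level view: any non-'O' tag marks a PII token, regardless of its type.
pairs = list(zip(annotation, prediction))
tp = sum(a != "O" and p != "O" for a, p in pairs)   # 2
fp = sum(a == "O" and p != "O" for a, p in pairs)   # 0
fn = sum(a != "O" and p == "O" for a, p in pairs)   # 0

print(tp / (tp + fp), tp / (tp + fn))   # 1.0 1.0 per sample, hence overall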