def __init__(self):
  """Initializes empty micro/macro accumulators, overall and per infoType."""
  stats_factory = results_pb2.Stats
  # Micro-averaged (pooled) counts and macro-averaged (per-record) stats.
  self.micro = stats_factory()
  self.macro = _MacroStats()
  # Map from info type name to Stats pb.
  self.per_type = collections.defaultdict(stats_factory)
  # Counterparts computed while ignoring the info type of each finding.
  self.typeless_micro = stats_factory()
  self.typeless_macro = _MacroStats()
def testStrictMatching(self):
  """Strict matching counts only exact (type, interval) matches as hits."""
  make = eval_lib.Finding

  def stats_of(**fields):
    # Build a Stats pb with the given scalar fields set.
    pb = results_pb2.Stats()
    for field_name, value in fields.items():
      setattr(pb, field_name, value)
    return pb

  findings = {
      make('TYPE_A', 0, 3, 'one'),
      make('TYPE_B', 5, 8, 'two'),
      make('TYPE_C', 20, 25, 'three'),
      make('TYPE_D', 30, 34, 'four'),
  }
  golden_findings = {
      make('TYPE_A', 0, 3, 'hit'),
      make('TYPE_B', 7, 10, 'hit'),
      make('TYPE_C', 25, 29, 'miss'),
      make('TYPE_E', 30, 34, 'wrong type'),
  }
  result = eval_lib.count_matches(
      findings, golden_findings, record_id='', strict=True, ignore_type=False)

  # Only TYPE_A matches exactly; everything else misses on span or type.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=1, false_positives=3, false_negatives=3,
          precision=0.25, recall=0.25, f_score=0.25)),
      normalize_floats(result.stats))

  # Ignoring types, the TYPE_D/TYPE_E pair also becomes a hit.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=2, false_positives=2, false_negatives=2,
          precision=0.5, recall=0.5, f_score=0.5)),
      normalize_floats(result.typeless))

  expected_per_type = {
      'TYPE_A': stats_of(true_positives=1),
      'TYPE_B': stats_of(false_positives=1, false_negatives=1),
      'TYPE_C': stats_of(false_positives=1, false_negatives=1),
      'TYPE_D': stats_of(false_positives=1),
      'TYPE_E': stats_of(false_negatives=1),
  }
  self.assertEqual(expected_per_type, result.per_type)
def testCalculateStats(self):
  """calculate_stats fills precision/recall/f-score, with NaN on empty."""
  populated = results_pb2.Stats()
  populated.true_positives = 12
  populated.false_positives = 8
  populated.false_negatives = 3
  eval_lib.calculate_stats(populated)
  self.assertAlmostEqual(.6, populated.precision)
  self.assertAlmostEqual(.8, populated.recall)
  self.assertAlmostEqual(.6857142857142856, populated.f_score)

  # With all counts at zero every denominator vanishes, so the derived
  # metrics are NaN and an explanatory error message is recorded.
  empty = results_pb2.Stats()
  eval_lib.calculate_stats(empty)
  for metric in (empty.precision, empty.recall, empty.f_score):
    self.assertTrue(math.isnan(metric))
  self.assertEqual(
      'Precision has denominator of zero. Recall has denominator of zero. '
      'f-score is NaN', empty.error_message)
def testIntervalsCountNotExactMatch(self):
  """Interval comparison credits partial overlaps, not just exact spans."""
  make = eval_lib.Finding

  def stats_of(**fields):
    # Build a Stats pb with the given scalar fields set.
    pb = results_pb2.Stats()
    for field_name, value in fields.items():
      setattr(pb, field_name, value)
    return pb

  findings = {
      make('NAME', 1, 8, 'he quic'),  # Inside the golden interval.
      make('NAME', 10, 19, 'brown fox'),  # Encloses the golden interval.
      make('NAME', 20, 30, 'jumps over'),  # Partially overlaps the golden.
  }
  golden_findings = {
      make('NAME', 0, 9, 'The quick'),  # Encloses the finding.
      make('NAME', 11, 18, 'rown fo'),  # Inside the finding.
      make('NAME', 26, 34, 'over the'),  # Partially overlaps the finding.
  }
  result = eval_lib.intervals_count_compare(findings, golden_findings,
                                            record_id='')

  expected_counts = dict(
      true_positives=3, false_positives=3, false_negatives=3)
  expected_with_rates = stats_of(
      precision=0.5, recall=0.5, f_score=0.5, **expected_counts)

  self.assertEqual(normalize_floats(expected_with_rates),
                   normalize_floats(result.typeless))
  self.assertEqual(normalize_floats(expected_with_rates),
                   normalize_floats(result.stats))
  self.assertEqual({'NAME': stats_of(**expected_counts)}, result.per_type)
def testCharactersCountIgnoringNonAlphanumerics(self):
  """Character-level comparison skips non-alphanumeric characters."""
  make = eval_lib.Finding

  def stats_of(**fields):
    # Build a Stats pb with the given scalar fields set.
    pb = results_pb2.Stats()
    for field_name, value in fields.items():
      setattr(pb, field_name, value)
    return pb

  findings = {
      make('NAME', 0, 9, 'The quick'),
      make('ID', 10, 19, 'brown fox'),
      make('ORGANIZATION', 20, 30, 'jumps over'),
  }
  golden_findings = {
      make('NAME', 0, 9, 'The quick'),
      make('AGE', 10, 19, 'brown fox'),
      make('DATE', 35, 43, 'lazy dog'),
  }
  result = eval_lib.characters_count_compare(
      findings, golden_findings, record_id='', ignore_nonalphanumerics=True)

  # Typeless: overlapping characters count regardless of info type.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=16, false_positives=9, false_negatives=7,
          precision=0.64, recall=0.695652, f_score=0.666667)),
      normalize_floats(result.typeless))

  # Typed: only the NAME characters line up across both sets.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=8, false_positives=17, false_negatives=15,
          precision=0.32, recall=0.347826, f_score=0.333333)),
      normalize_floats(result.stats))

  expected_per_type = {
      'NAME': stats_of(true_positives=8),
      'ID': stats_of(false_positives=8),
      'AGE': stats_of(false_negatives=8),
      'ORGANIZATION': stats_of(false_positives=9),
      'DATE': stats_of(false_negatives=7),
  }
  self.assertEqual(expected_per_type, result.per_type)
def testTypedTokensCount(self):
  """Token-level comparison requires matching info types for true positives."""
  make = eval_lib.Finding

  def stats_of(**fields):
    # Build a Stats pb with the given scalar fields set.
    pb = results_pb2.Stats()
    for field_name, value in fields.items():
      setattr(pb, field_name, value)
    return pb

  findings = {
      make('NAME', 0, 9, 'The quick'),
      make('ID', 10, 19, 'brown fox'),
      make('ORGANIZATION', 20, 30, 'jumps over'),
  }
  golden_findings = {
      make('NAME', 0, 9, 'The quick'),
      make('AGE', 10, 19, 'brown fox'),
      make('DATE', 35, 43, 'lazy dog'),
  }
  result = eval_lib.typed_token_compare(findings, golden_findings,
                                        record_id='')

  # Typeless: four tokens overlap in position, two per side do not.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=4, false_positives=2, false_negatives=2,
          precision=0.666667, recall=0.666667, f_score=0.666667)),
      normalize_floats(result.typeless))

  # Typed: only the two NAME tokens agree on both span and type.
  self.assertEqual(
      normalize_floats(stats_of(
          true_positives=2, false_positives=4, false_negatives=4,
          precision=0.333333, recall=0.333333, f_score=0.333333)),
      normalize_floats(result.stats))

  expected_per_type = {
      'NAME': stats_of(true_positives=2),
      'ID': stats_of(false_positives=2),
      'AGE': stats_of(false_negatives=2),
      'ORGANIZATION': stats_of(false_positives=2),
      'DATE': stats_of(false_negatives=2),
  }
  self.assertEqual(expected_per_type, result.per_type)
def calculate_stats(self):
  """Generate a results_pb2.Stats message with the macro-averaged results."""
  averaged = results_pb2.Stats()
  if not self.count:
    # Nothing was accumulated; report NaN metrics with an explanation.
    not_a_number = float('NaN')
    averaged.precision = not_a_number
    averaged.recall = not_a_number
    averaged.f_score = not_a_number
    averaged.error_message = 'Averaging over zero results.'
    return averaged
  averaged.precision = self.precision_sum / float(self.count)
  averaged.recall = self.recall_sum / float(self.count)
  # Macro f-score is the harmonic mean of the averaged precision and recall.
  averaged.f_score = hmean(averaged.precision, averaged.recall)
  averaged.error_message = self.error_message
  return averaged
def __init__(self):
  """Initializes an empty per-record result."""
  stats_factory = results_pb2.Stats
  # Identifier of the record these results describe.
  self.record_id = ''
  # Typed overall counts, plus a per-infoType breakdown.
  self.stats = stats_factory()
  self.per_type = collections.defaultdict(stats_factory)
  # Counts computed while ignoring the info type of each finding.
  self.typeless = stats_factory()
  self.debug_info = []
def testAccumulateResults(self):
  """AccumulatedResults sums micro counts and macro-averages the rates."""

  def fill(pb, tp, fp, fn):
    # Set the three raw counts on a Stats pb.
    pb.true_positives = tp
    pb.false_positives = fp
    pb.false_negatives = fn

  result1 = eval_lib.IndividualResult()
  fill(result1.stats, 30, 20, 10)
  fill(result1.per_type['TypeA'], 9, 8, 7)
  fill(result1.per_type['TypeB'], 6, 5, 4)
  fill(result1.typeless, 15, 14, 13)
  eval_lib.calculate_stats(result1.stats)
  eval_lib.calculate_stats(result1.typeless)

  result2 = eval_lib.IndividualResult()
  fill(result2.stats, 3, 2, 1)
  fill(result2.per_type['TypeA'], 19, 18, 17)
  fill(result2.per_type['TypeB'], 16, 15, 14)
  fill(result2.typeless, 13, 12, 11)
  eval_lib.calculate_stats(result2.stats)
  eval_lib.calculate_stats(result2.typeless)

  accumulated = eval_lib.AccumulatedResults()
  accumulated.add_result(result1)
  accumulated.add_result(result2)

  # Micro counts are straight sums across the two records.
  expected_micro = results_pb2.Stats()
  fill(expected_micro, 33, 22, 11)
  self.assertEqual(expected_micro, accumulated.micro)

  # Macro stats average the per-record precision/recall values.
  expected_macro = results_pb2.Stats()
  expected_macro.precision = 0.6
  expected_macro.recall = 0.75
  expected_macro.f_score = 0.666667
  self.assertEqual(normalize_floats(expected_macro),
                   normalize_floats(accumulated.macro.calculate_stats()))

  expected_type_a = results_pb2.Stats()
  fill(expected_type_a, 28, 26, 24)
  expected_type_b = results_pb2.Stats()
  fill(expected_type_b, 22, 20, 18)
  self.assertEqual({'TypeA': expected_type_a, 'TypeB': expected_type_b},
                   accumulated.per_type)

  expected_typeless_micro = results_pb2.Stats()
  fill(expected_typeless_micro, 28, 26, 24)
  self.assertEqual(expected_typeless_micro, accumulated.typeless_micro)

  expected_typeless_macro = results_pb2.Stats()
  expected_typeless_macro.precision = 0.518621
  expected_typeless_macro.recall = 0.53869
  expected_typeless_macro.f_score = 0.528465
  self.assertEqual(
      normalize_floats(expected_typeless_macro),
      normalize_floats(accumulated.typeless_macro.calculate_stats()))