class ProteinMacroEvaluationTest(CalculationAssertions):

    def setUp(self):
        RandomProteinEvaluation = \
            ProteinMacroEvaluationTest.init_random_protein_evaluation_mock
        self.random_mocks = dict(
            (i, RandomProteinEvaluation()) for i in range(10)
        )
        self.evaluator = ProteinMacroEvaluation(self.random_mocks)

    @staticmethod
    @patch('biocreative.evaluation.calculation.hits.Hits', spec=True)
    def init_random_protein_evaluation_mock(unused):
        mock = Mock(wraps=ProteinEvaluation())

        for prop in C.PROTEIN_PROPERTIES:
            setattr(mock, prop, random())

        for attr in C.HITS_ATTRIBUTES:
            setattr(mock.hits, attr, randint(0, 1000))

        return mock

    def test_std_dev(self):
        for prop in C.PROTEIN_PROPERTIES:
            if prop != 'avrg_p':
                expected = ProteinMacroEvaluationTest.calculate_std_dev(
                    [getattr(m, prop) for m in self.random_mocks.values()]
                )
                received = self.evaluator.std_dev(prop)
                self.assert_values(prop, expected, received)

    def test_properties_except_hits(self):
        for prop in C.PROTEIN_PROPERTIES:
            if prop != 'avrg_p':
                expected = self.get_average_for(prop)
                self.assert_property(prop, expected)

    @patch('biocreative.evaluation.calculation.hits.Hits', spec=True)
    def test_hits(self, unused):
        expected = dict()

        for attr in C.HITS_ATTRIBUTES:
            expected[attr] = sum(
                getattr(mock.hits, attr)
                for mock in self.random_mocks.values()
            )

        received = self.evaluator.hits

        for attr in C.HITS_ATTRIBUTES:
            self.assertEqual(
                getattr(received, attr), expected[attr],
                "%s hits don't match (received: %i, expected: %i)" % (
                    attr, getattr(received, attr), expected[attr]
                )
            )

    def test_average_for(self):
        expected = self.get_average_for('precision')
        received = self.evaluator._average_for('precision')
        self.assert_values('average_for', expected, received)

    def test_static_calculations(self):
        for kind in ('variation', 'variance', 'std_dev'):
            self.run_static_calc_test_for(kind)

    def run_static_calc_test_for(self, name):
        rnd_floats = [random() for i in range(10)]
        expected_fun = getattr(
            ProteinMacroEvaluationTest, 'calculate_%s' % name
        )
        test_fun = getattr(ProteinMacroEvaluation, '_%s' % name)
        expected = expected_fun(rnd_floats)
        received = test_fun(rnd_floats)
        self.assert_values(name, expected, received)

    def get_average_for(self, prop):
        return ProteinMacroEvaluationTest.calculate_average(
            [getattr(m, prop) for m in self.random_mocks.values()]
        )

    @staticmethod
    def calculate_average(numbers):
        total = sum(numbers)
        return float(total) / len(numbers)

    @staticmethod
    def calculate_variation(numbers):
        average = ProteinMacroEvaluationTest.calculate_average(numbers)
        return sum((i - average) ** 2 for i in numbers)

    @staticmethod
    def calculate_variance(numbers):
        variation = ProteinMacroEvaluationTest.calculate_variation(numbers)
        return variation / float(len(numbers))

    @staticmethod
    def calculate_std_dev(numbers):
        variance = ProteinMacroEvaluationTest.calculate_variance(numbers)
        return sqrt(variance)
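

# A minimal worked example (added for illustration, not part of the original
# test suite): the calculate_* helpers above implement the population
# statistics -- variation as the sum of squared deviations, variance as
# variation / N, and std_dev as its square root. The numbers below are
# hypothetical and chosen for easy hand calculation.
def _population_std_dev_worked_example():
    numbers = [1.0, 2.0, 3.0]
    average = ProteinMacroEvaluationTest.calculate_average(numbers)      # 2.0
    variation = ProteinMacroEvaluationTest.calculate_variation(numbers)  # 1 + 0 + 1 = 2.0
    variance = ProteinMacroEvaluationTest.calculate_variance(numbers)    # 2.0 / 3
    std_dev = ProteinMacroEvaluationTest.calculate_std_dev(numbers)      # sqrt(2/3) ~= 0.816
    return average, variation, variance, std_dev
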
class ProteinEvaluator(AbstractEvaluator):
    """Implementation of the evaluation process for INT and IPT."""

    def reset(self):
        """Reset the internal state to reuse the evaluator."""
        self.primary_eval = ProteinEvaluation()
        self.secondary_eval = ProteinMacroEvaluation()
        self.results = None
        self.gold_standard = None
        self.logger = logging.getLogger("ProteinEvaluator")
        self._dois = None

    def _prepare(self):
        """Prepare the instance for the evaluation run."""
        assert len(self.results) == len(self.gold_standard), \
            "the entries in the evaluation result and the gold standard " \
            "do not match"
        self.primary_eval.set_fn(self.gold_standard.true_items())
        self.logger.debug(
            "INT/IPT evaluation: %i GS annotations" %
            self.primary_eval.hits.fn
        )

    def _process(self):
        """Process the result set."""
        self._dois = sorted(self.results.keys())
        result_sizes = [
            len(result_list) for result_list in self.results.values()
        ]
        max_rank_in_results = max(result_sizes) if len(result_sizes) else 0
        self.logger.info(
            "longest result set has %i annotations", max_rank_in_results
        )

        if self.cutoff and self.cutoff < max_rank_in_results:
            max_rank_in_results = self.cutoff

        for doi in list(self._dois):
            std_items = self.gold_standard[doi]
            result_doc = ProteinEvaluation(doi=doi, fn=len(std_items))
            self.secondary_eval[doi] = result_doc

        for rank in range(max_rank_in_results):
            for doi in list(self._dois):
                self._process_doi(doi, rank)

            # Calculate & store the average P/R pair at this rank
            # over all documents (macro-averaging).
            self.secondary_eval.store_p_at_current_r()
            # Calculate & store the current P/R value at this rank
            # over all documents (micro-averaging).
            self.primary_eval.store_p_at_current_r()

    def _process_doi(self, doi, rank):
        """Evaluate the result at a given rank for a document."""
        result_items = self.results[doi]
        std_items = self.gold_standard.get(doi)  # special syntax for mocking

        try:
            item = result_items[rank]
        except IndexError:
            # no more results for this DOI
            self._dois.remove(doi)
        else:
            if item.confidence is not None and \
                    item.confidence < self.min_conf:
                self._dois.remove(doi)  # confidence-based cutoff
            else:
                # evaluate the result at the current rank
                self.primary_eval.evaluate_item(item, std_items)
                self.secondary_eval[doi].evaluate_item(item, std_items)
                self.secondary_eval[doi].store_p_at_current_r()
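

# A minimal sketch (added for illustration, not from the original module) of
# the difference between the two averaging strategies referenced in
# _process() above: the macro-average is the mean of per-document precision
# values, while the micro-average is computed from the summed hit counts.
# The per-document (tp, fp) counts below are hypothetical.
def _macro_vs_micro_precision_demo():
    per_doc_hits = [(2, 1), (1, 3), (5, 0)]
    macro = sum(
        tp / float(tp + fp) for tp, fp in per_doc_hits
    ) / len(per_doc_hits)                          # (2/3 + 1/4 + 1) / 3 ~= 0.639
    total_tp = sum(tp for tp, fp in per_doc_hits)
    total_fp = sum(fp for tp, fp in per_doc_hits)
    micro = total_tp / float(total_tp + total_fp)  # 8/12 ~= 0.667
    return macro, micro
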
class CalculationTests(CalculationAssertions):

    def test_article_auc_pr(self):
        self.evaluator = CalculationTests.simulate_article_evaluator(
            ArticleAucPrEvaluation, 3
        )
        self.assert_hits(self.evaluator.hits, tp=3, fp=7, fn=0, tn=0)
        self.assert_property("p_at_full_r", 3/8.0)
        recall_span = 1/3.0
        self.assert_property(
            "auc_pr",
            (1/1.0 + 1/1.0) / 2 * recall_span +
            (1/4.0 + 2/5.0) / 2 * recall_span +
            (2/7.0 + 3/8.0) / 2 * recall_span
        )

    def test_article_mcc(self):
        self.evaluator = CalculationTests.simulate_article_evaluator(
            ArticleMccEvaluation, 0
        )
        self.assert_hits(self.evaluator.hits, tp=2, fp=2, fn=1, tn=5)
        self.assert_property("sensitivity", 2/3.0)  # tp / (tp + fn)
        self.assert_property("specificity", 5/7.0)  # tn / (tn + fp)
        self.assert_property("accuracy", 7/10.0)  # (tp + tn) / sum(hits)
        # mcc = (tp*tn - fp*fn) / sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn))
        self.assert_property(
            "mcc_score", (2*5 - 2*1) / sqrt(4*3*7*6)
        )

    @staticmethod
    def simulate_article_evaluator(EvaluatorClass, fn_count):
        evaluator = EvaluatorClass(fn=fn_count)
        t = True
        f = False

        for result_item, std_item in [
            (t, t), (f, f), (f, f), (t, f), (f, t),
            (f, f), (f, f), (t, t), (f, f), (t, f)
        ]:
            evaluator.evaluate(result_item, std_item, None)

        return evaluator

    def test_protein_for_normalizations(self):
        self.helper_protein(["A", "B", "C", "D"], ["A", "C", "D"])

    def test_protein_for_pairs(self):
        A, B, C, D = ("a", "x"), ("b", "y"), ("c", "z"), ("d", "w")
        self.helper_protein([A, B, C, D], [A, C, D])

    def helper_protein(self, result_items, std_items):
        self.evaluator = ProteinEvaluation(doi="test", fn=3)
        self.evaluator.evaluate(result_items, std_items, 3)
        self.assert_hits(self.evaluator.hits, tp=2, fp=1, fn=1, tn=0)
        p = 2/3.0
        r = 2/3.0
        self.assert_property("precision", p)
        self.assert_property("recall", r)
        self.assert_property("f_score", 2.0 * p * r / (p + r))
        self.assert_property("p_at_full_r", None)
        self.assert_property("avrg_p", 1/1.0 * 1/3.0 + 2/3.0 * 1/3.0)

    def test_macro_evaluation(self):
        protein_results = [
            CalculationTests.random_protein_result() for i in range(50)
        ]
        self.evaluator = ProteinMacroEvaluation(
            (i, r) for i, r in enumerate(protein_results)
        )
        N = len(protein_results)
        precision = sum(p.precision for p in protein_results) / N
        recall = sum(p.recall for p in protein_results) / N
        f_score = sum(p.f_score for p in protein_results) / N
        self.assert_property("precision", precision)
        self.assert_property("recall", recall)
        self.assert_property("f_score", f_score)

    @staticmethod
    def random_protein_result():
        results = list(set([randint(1, 100) for i in range(100)]))
        gold_standard = sample(range(1, 101), 10)
        evaluator = ProteinEvaluation(doi="test", fn=len(gold_standard))
        evaluator.evaluate(results, gold_standard, 0)
        return evaluator
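

# Worked arithmetic (added for illustration, not part of the original tests)
# behind the expectations asserted in test_article_mcc above, using the
# standard confusion-matrix definitions and the hit counts from that test.
def _article_mcc_worked_example():
    from math import sqrt
    tp, fp, fn, tn = 2, 2, 1, 5
    sensitivity = tp / float(tp + fn)                # 2/3
    specificity = tn / float(tn + fp)                # 5/7
    accuracy = (tp + tn) / float(tp + fp + fn + tn)  # 7/10
    mcc = (tp * tn - fp * fn) / sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    )                                                # 8 / sqrt(504) ~= 0.356
    return sensitivity, specificity, accuracy, mcc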