def main():
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser(
      description='Evaluate DeID findings on Google Cloud.')
  run_pipeline_lib.add_all_args(parser)
  args, pipeline_args = parser.parse_known_args(sys.argv[1:])
  errors = run_pipeline_lib.run_pipeline(
      args.mae_input_pattern, args.mae_golden_dir, args.results_dir,
      args.mae_input_query, args.mae_golden_table,
      args.write_per_note_stats_to_gcs, args.results_table,
      args.per_note_results_table, args.debug_output_table,
      args.types_to_ignore or [], None, pipeline_args)
  if errors:
    logging.error(errors)
    return 1
  logging.info('Ran eval.')
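
# Example invocation (a sketch; the flag names are inferred from the args.*
# attributes above and assume run_pipeline_lib.add_all_args registers matching
# flags, and the paths are hypothetical). Flags argparse does not recognize are
# passed through to Beam as pipeline_args via parse_known_args:
#
#   python run_pipeline.py \
#       --mae_input_pattern=gs://my-bucket/findings/ \
#       --mae_golden_dir=gs://my-bucket/goldens/ \
#       --results_dir=gs://my-bucket/results/ \
#       --results_table=my-project:dataset.results \
#       --project=my-project --runner=DataflowRunner
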
def evaluate(): """Run evaluation pipeline.""" if flask.request.method == 'GET': jobs, offset = model.get_list(model.EvalJobTable) return flask.jsonify(jobs=jobs, offset=offset), 200 # Process POST requests. try: jsonschema.validate(flask.request.json, eval_pipeline_shema) except jsonschema.ValidationError: error_msg = 'unable to validate provided payload.' return flask.jsonify(error=400, text=error_msg), 400 (mae_input_pattern, mae_golden_dir, results_dir, mae_input_query, mae_golden_table, write_per_note_stats_to_gcs, results_table, per_note_results_table, debug_output_table, types_to_ignore) = ( None, None, None, None, None, None, None, None, None, None) job_data = { 'name': flask.request.json['name'], 'timestamp': datetime.utcnow(), } # Get input info input_json = flask.request.json['input'] gcs_input, bq_input = input_json.get('gcs'), input_json.get('bigquery') if gcs_input: mae_input_pattern = job_data['findings'] = gcs_input['pattern'] + '*.xml' mae_golden_dir = job_data['goldens'] = gcs_input['golden'] if bq_input: job_data['findings'] = bq_input['query'] mae_input_query = append_project(job_data['findings']) job_data['goldens'] = bq_input['golden'] mae_golden_table = append_project(job_data['goldens']) try: findings_dataset, findings_table = job_data['findings'].split('.') get_bq_table(findings_dataset, findings_table) golden_dataset, golden_table = job_data['golden'].split('.') get_bq_table(golden_dataset, golden_table) except exceptions.NotFound: error_msg = 'unable to locate input BigQuery tables' return flask.jsonify(error=400, text=error_msg), 400 # Get output info output_json = flask.request.json['output'] gcs_output, bq_output = output_json.get('gcs'), output_json.get('bigquery') if gcs_output: results_dir = job_data['stats'] = gcs_output['dir'] write_per_note_stats_to_gcs = gcs_output['debug'] if write_per_note_stats_to_gcs: job_data['debug'] = gcs_output['dir'] if bq_output: job_data['stats'] = bq_output['stats'] results_table = append_project(job_data['stats']) job_data['debug'] = bq_output['debug'] debug_output_table = append_project(job_data['debug']) if bq_output.get('perNote'): per_note_results_table = append_project(bq_output.get('perNote')) # Get types to ignore types_to_ignore = flask.request.json.get('ignoreTypes') or [] # Get pipeline args pipeline_args = [] eval_job = model.create(model.EvalJobTable, job_data) errors = eval_lib.run_pipeline(mae_input_pattern, mae_golden_dir, results_dir, mae_input_query, mae_golden_table, write_per_note_stats_to_gcs, results_table, per_note_results_table, debug_output_table, types_to_ignore, eval_job.timestamp, pipeline_args) if errors: eval_job.update(status=400, log_trace=errors) return flask.jsonify(error=400, text=errors), 400 eval_job.update(status=200) return flask.jsonify(result='success'), 200
def testE2eBigquery(self, mock_bq_source_fn, mock_bq_sink_fn, mock_utcnow_fn):
  def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
    return beam_testutil.FakeSink(table_name)
  mock_bq_sink_fn.side_effect = make_sink
  now = 'current time'
  mock_utcnow_fn.return_value = now

  tp_tag = tag_template.format('TypeA', 0, 5)
  fp_tag = tag_template.format('TypeA', 8, 10)
  fn_tag = tag_template.format('TypeA', 11, 13)
  fn2_tag = tag_template.format('TypeA', 15, 19)
  findings_tags = '\n'.join([tp_tag, fp_tag])
  golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'findings_record_id': '111-1',
      'findings_xml': xml_template.format(findings_tags),
      'golden_xml': xml_template.format(golden_tags)
  }]

  types_to_ignore = ['ignore']
  # These features are tested in testE2eGCS.
  input_pattern, golden_dir, results_dir, per_note_table, debug_table = (
      None, None, None, None, None)
  mae_input_query = 'SELECT * from [project.dataset.table]'
  mae_golden_table = 'project.dataset.golden_table'
  run_pipeline_lib.run_pipeline(input_pattern, golden_dir, results_dir,
                                mae_input_query, mae_golden_table, False,
                                'results_table', per_note_table, debug_table,
                                types_to_ignore, pipeline_args=None)

  # Check that we generated the query correctly.
  mock_bq_source_fn.assert_called_with(
      query=('SELECT findings.record_id, findings.xml, golden.xml FROM '
             '(SELECT * from [project.dataset.table]) AS findings '
             'LEFT JOIN [project.dataset.golden_table] AS golden '
             'ON findings.record_id=golden.record_id'))

  # Check we wrote the correct results to BigQuery.
  expected_results = [{
      'info_type': 'ALL',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeA',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }]
  for r in expected_results:
    r.update({'timestamp': now})
  actual_results = sorted(
      beam_testutil.get_table('results_table'), key=lambda x: x['info_type'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                   [normalize_dict_floats(r) for r in actual_results])
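
# Sanity check for the expected numbers above: the findings share one tag with
# the golden (true positive), add one of their own (false positive), and miss
# two golden tags (false negatives). A hypothetical helper showing the
# arithmetic (not part of the module under test):
def _expected_scores(tp=1, fp=1, fn=2):
  precision = tp / float(tp + fp)  # 1 / 2 = 0.5
  recall = tp / float(tp + fn)     # 1 / 3 ~= 0.333333
  # f_score is the harmonic mean of precision and recall.
  f_score = 2 * precision * recall / (precision + recall)  # 0.4
  return precision, recall, f_score
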
def testE2eGCS(self, fake_client_fn, mock_bq_sink_fn, mock_utcnow_fn):
  def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
    return beam_testutil.FakeSink(table_name)
  mock_bq_sink_fn.side_effect = make_sink
  now = 'current time'
  mock_utcnow_fn.return_value = now

  input_pattern = 'gs://bucketname/input/*'
  golden_dir = 'gs://bucketname/goldens'
  results_dir = 'gs://bucketname/results'
  storage_client = testutil.FakeStorageClient()
  fake_client_fn.return_value = storage_client

  tp_tag = tag_template.format('TypeA', 0, 5)
  fp_tag = tag_template.format('TypeA', 8, 10)
  fn_tag = tag_template.format('TypeA', 11, 13)
  fn2_tag = tag_template.format('TypeA', 15, 19)
  findings_tags = '\n'.join([tp_tag, fp_tag])
  golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
  testutil.set_gcs_file('bucketname/input/1-1.xml',
                        xml_template.format(findings_tags))
  testutil.set_gcs_file('bucketname/goldens/1-1.xml',
                        xml_template.format(golden_tags))

  tp2_tag = tag_template.format('TypeB', 20, 21)
  # False negative + false positive for entity matching, but true positive
  # for binary token matching.
  entity_fp_tag = tag_template.format('TypeX', 30, 35)
  entity_fn_tag = tag_template.format('TypeY', 30, 35)
  # Three tokens are tagged as a single entity in the golden. This is not a
  # match for entity matching, but counts as three matches for binary token
  # matching.
  partial_tag1 = tag_template.format('TypeA', 36, 41)
  partial_tag2 = tag_template.format('TypeA', 42, 47)
  partial_tag3 = tag_template.format('TypeA', 48, 54)
  multi_token_tag = tag_template.format('TypeA', 36, 54)
  ignored_tag = tag_template.format('ignore', 55, 57)
  findings_tags = '\n'.join([
      tp_tag, tp2_tag, entity_fp_tag, partial_tag1, partial_tag2,
      partial_tag3, ignored_tag
  ])
  golden_tags = '\n'.join([tp_tag, tp2_tag, entity_fn_tag, multi_token_tag])
  testutil.set_gcs_file('bucketname/input/1-2.xml',
                        xml_template.format(findings_tags))
  testutil.set_gcs_file('bucketname/goldens/1-2.xml',
                        xml_template.format(golden_tags))

  self.old_write_to_text = beam.io.WriteToText
  beam.io.WriteToText = beam_testutil.DummyWriteTransform
  types_to_ignore = ['ignore']
  mae_input_query = None
  mae_golden_table = None
  run_pipeline_lib.run_pipeline(input_pattern, golden_dir, results_dir,
                                mae_input_query, mae_golden_table, True,
                                'results_table', 'per_note_results_table',
                                'debug_output_table', types_to_ignore,
                                pipeline_args=None)
  beam.io.WriteToText = self.old_write_to_text

  # Check we wrote the correct results to BigQuery.
  expected_results = [{
      'info_type': 'ALL',
      'recall': 0.7777777777777778,
      'precision': 0.875,
      'f_score': 0.823529411764706,
      'true_positives': 7,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeA',
      'recall': 0.7142857142857143,
      'precision': 0.8333333333333334,
      'f_score': 0.7692307692307694,
      'true_positives': 5,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeB',
      'recall': 1.0,
      'precision': 1.0,
      'f_score': 1.0,
      'true_positives': 1,
      'false_positives': 0,
      'false_negatives': 0
  }, {
      'info_type': u'TypeY',
      'recall': 1.0,
      'precision': 1.0,
      'f_score': 1.0,
      'true_positives': 1,
      'false_positives': 0,
      'false_negatives': 0
  }]
  for r in expected_results:
    r.update({'timestamp': now})
  actual_results = sorted(
      beam_testutil.get_table('results_table'), key=lambda x: x['info_type'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                   [normalize_dict_floats(r) for r in actual_results])

  full_text = 'word1 w2 w3 wrd4 5 word6 word7 multi token entity w8'

  def debug_info(record_id, classification, text, info_type, start, end):
    location = full_text.find(text)
    context = (full_text[0:location] + '{[--' + text + '--]}' +
               full_text[location + len(text):])
    return {
        'record_id': record_id,
        'classification': classification,
        'text': text,
        'info_type': info_type,
        'context': context,
        'start': start,
        'end': end
    }

  expected_debug_info = [
      debug_info('1-1', 'true_positive', 'word1', 'TypeA', 0, 5),
      debug_info('1-1', 'false_positive', 'w2', 'TypeA', 8, 10),
      debug_info('1-1', 'false_negative', 'w3', 'TypeA', 11, 13),
      debug_info('1-1', 'false_negative', 'wrd4', 'TypeA', 15, 19),
      debug_info('1-2', 'true_positive', 'word1', 'TypeA', 0, 5),
      debug_info('1-2', 'true_positive', '5', 'TypeB', 20, 21),
      debug_info('1-2', 'true_positive', 'word7', 'TypeY', 30, 35),
      debug_info('1-2', 'true_positive', 'multi', 'TypeA', 36, 41),
      debug_info('1-2', 'true_positive', 'token', 'TypeA', 42, 47),
      debug_info('1-2', 'true_positive', 'entity', 'TypeA', 48, 54),
  ]
  for r in expected_debug_info:
    r.update({'timestamp': now})

  def s(l):
    return sorted(l, key=lambda x: x['record_id'] + x['context'])
  self.assertEqual(s(expected_debug_info),
                   s(beam_testutil.get_table('debug_output_table')))

  expected_per_note = [{
      'record_id': '1-1',
      'precision': 0.5,
      'recall': 0.3333333333333333,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'record_id': '1-2',
      'precision': 1.0,
      'recall': 1.0,
      'f_score': 1.0,
      'true_positives': 6,
      'false_positives': 0,
      'false_negatives': 0
  }]
  for r in expected_per_note:
    r.update({'timestamp': now})
  actual_results = sorted(
      beam_testutil.get_table('per_note_results_table'),
      key=lambda x: x['record_id'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_per_note],
                   [normalize_dict_floats(r) for r in actual_results])

  # Check we wrote the correct results to GCS.
  expected_text = ''
  with open(os.path.join(TESTDATA_DIR, 'expected_results')) as f:
    expected_text = f.read()
  expected_results = results_pb2.Results()
  text_format.Merge(expected_text, expected_results)
  results = results_pb2.Results()
  text_format.Merge(
      testutil.get_gcs_file('bucketname/results/aggregate_results.txt'),
      results)
  self.assertEqual(normalize_floats(expected_results),
                   normalize_floats(results))

  # Check the per-file results were written correctly.
  expected_result1 = results_pb2.IndividualResult()
  text_format.Merge(
      """
      record_id: "1-1"
      stats {
        true_positives: 1
        false_positives: 1
        false_negatives: 2
        precision: 0.5
        recall: 0.333333333333
        f_score: 0.4
      }""", expected_result1)
  expected_result2 = results_pb2.IndividualResult()
  text_format.Merge(
      """
      record_id: "1-2"
      stats {
        true_positives: 6
        precision: 1.0
        recall: 1.0
        f_score: 1.0
      }""", expected_result2)
  normalize_floats(expected_result1)
  normalize_floats(expected_result2)

  full_text = testutil.get_gcs_file('bucketname/results/per-note-results')
  actual_results = []
  for record in sorted(full_text.split('\n\n')):
    if not record:
      continue
    actual_result = results_pb2.IndividualResult()
    text_format.Merge(record, actual_result)
    actual_results.append(normalize_floats(actual_result))
  self.assertEqual([expected_result1, expected_result2], actual_results)
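
# The comparisons in these tests rely on normalize_dict_floats and
# normalize_floats, which are defined elsewhere in the test module. A minimal
# sketch of what the dict variant might look like, assuming it simply rounds
# floats so values computed along different code paths compare equal:
def _normalize_dict_floats_sketch(row, places=6):
  return {k: round(v, places) if isinstance(v, float) else v
          for k, v in row.items()}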