Example 1
import argparse
import logging
import sys

# run_pipeline_lib is the project-local module that defines the evaluation
# pipeline and its command-line flags; it is assumed to be importable here.
import run_pipeline_lib


def main():
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser(
      description='Evaluate DeID findings on Google Cloud.')
  run_pipeline_lib.add_all_args(parser)
  args, pipeline_args = parser.parse_known_args(sys.argv[1:])

  errors = run_pipeline_lib.run_pipeline(
      args.mae_input_pattern, args.mae_golden_dir, args.results_dir,
      args.mae_input_query, args.mae_golden_table,
      args.write_per_note_stats_to_gcs, args.results_table,
      args.per_note_results_table, args.debug_output_table,
      args.types_to_ignore or [], None, pipeline_args)

  if errors:
    logging.error(errors)
    return 1

  logging.info('Ran eval.')
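
The flags above are registered by run_pipeline_lib.add_all_args, which is not shown here. A hypothetical invocation sketch, assuming the flag names mirror the attributes read from args in main(); the GCS paths and table name are placeholders:

# Hypothetical usage sketch; flag names are assumed to mirror the attributes
# read from `args` above, and the paths and table name are placeholders.
# Any unrecognized flags are forwarded to the Beam pipeline via pipeline_args.
import sys

sys.argv = [
    'run_pipeline.py',
    '--mae_input_pattern', 'gs://my-bucket/findings/*.xml',
    '--mae_golden_dir', 'gs://my-bucket/goldens',
    '--results_dir', 'gs://my-bucket/results',
    '--results_table', 'my_dataset.results',
]
sys.exit(main() or 0)
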
Example 2
from datetime import datetime

import flask
import jsonschema

# Other names used below (model, eval_lib, append_project, get_bq_table,
# eval_pipeline_shema, exceptions) are project-local and assumed to be
# imported elsewhere in this module.


def evaluate():
  """Run evaluation pipeline."""
  if flask.request.method == 'GET':
    jobs, offset = model.get_list(model.EvalJobTable)
    return flask.jsonify(jobs=jobs, offset=offset), 200

  # Process POST requests.
  try:
    jsonschema.validate(flask.request.json, eval_pipeline_shema)
  except jsonschema.ValidationError:
    error_msg = 'unable to validate provided payload.'
    return flask.jsonify(error=400, text=error_msg), 400

  (mae_input_pattern, mae_golden_dir, results_dir, mae_input_query,
   mae_golden_table, write_per_note_stats_to_gcs, results_table,
   per_note_results_table, debug_output_table, types_to_ignore) = (
       None, None, None, None, None, None, None, None, None, None)

  job_data = {
      'name': flask.request.json['name'],
      'timestamp': datetime.utcnow(),
  }

  # Get input info
  input_json = flask.request.json['input']
  gcs_input, bq_input = input_json.get('gcs'), input_json.get('bigquery')
  if gcs_input:
    mae_input_pattern = job_data['findings'] = gcs_input['pattern'] + '*.xml'
    mae_golden_dir = job_data['goldens'] = gcs_input['golden']
  if bq_input:
    job_data['findings'] = bq_input['query']
    mae_input_query = append_project(job_data['findings'])
    job_data['goldens'] = bq_input['golden']
    mae_golden_table = append_project(job_data['goldens'])
    try:
      findings_dataset, findings_table = job_data['findings'].split('.')
      get_bq_table(findings_dataset, findings_table)
      golden_dataset, golden_table = job_data['goldens'].split('.')
      get_bq_table(golden_dataset, golden_table)
    except exceptions.NotFound:
      error_msg = 'unable to locate input BigQuery tables'
      return flask.jsonify(error=400, text=error_msg), 400

  # Get output info
  output_json = flask.request.json['output']
  gcs_output, bq_output = output_json.get('gcs'), output_json.get('bigquery')
  if gcs_output:
    results_dir = job_data['stats'] = gcs_output['dir']
    write_per_note_stats_to_gcs = gcs_output['debug']
    if write_per_note_stats_to_gcs:
      job_data['debug'] = gcs_output['dir']
  if bq_output:
    job_data['stats'] = bq_output['stats']
    results_table = append_project(job_data['stats'])
    job_data['debug'] = bq_output['debug']
    debug_output_table = append_project(job_data['debug'])
    if bq_output.get('perNote'):
      per_note_results_table = append_project(bq_output.get('perNote'))

  # Get types to ignore
  types_to_ignore = flask.request.json.get('ignoreTypes') or []

  # Get pipeline args
  pipeline_args = []

  eval_job = model.create(model.EvalJobTable, job_data)
  errors = eval_lib.run_pipeline(mae_input_pattern, mae_golden_dir, results_dir,
                                 mae_input_query, mae_golden_table,
                                 write_per_note_stats_to_gcs, results_table,
                                 per_note_results_table, debug_output_table,
                                 types_to_ignore, eval_job.timestamp,
                                 pipeline_args)

  if errors:
    eval_job.update(status=400, log_trace=errors)
    return flask.jsonify(error=400, text=errors), 400

  eval_job.update(status=200)
  return flask.jsonify(result='success'), 200
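
The handler reads a fixed set of keys from flask.request.json. A sketch of a payload with that shape; the values are placeholders, and either the gcs or bigquery variant (or both) may be supplied for input and output:

# Illustrative payload derived from the keys accessed in evaluate(); all
# values are placeholders.
example_payload = {
    'name': 'nightly-eval',
    'input': {
        'gcs': {'pattern': 'gs://my-bucket/findings/',
                'golden': 'gs://my-bucket/goldens'},
        'bigquery': {'query': 'my_dataset.findings',
                     'golden': 'my_dataset.goldens'},
    },
    'output': {
        'gcs': {'dir': 'gs://my-bucket/results', 'debug': True},
        'bigquery': {'stats': 'my_dataset.stats', 'debug': 'my_dataset.debug',
                     'perNote': 'my_dataset.per_note'},
    },
    'ignoreTypes': ['OTHER'],
}
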
Example 3
    def testE2eBigquery(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_utcnow_fn):
        # The mock_* parameters are injected by mock.patch decorators on this
        # test method; the decorators are not shown in this snippet.
        def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
            return beam_testutil.FakeSink(table_name)

        mock_bq_sink_fn.side_effect = make_sink
        now = 'current time'
        mock_utcnow_fn.return_value = now

        tp_tag = tag_template.format('TypeA', 0, 5)
        fp_tag = tag_template.format('TypeA', 8, 10)
        fn_tag = tag_template.format('TypeA', 11, 13)
        fn2_tag = tag_template.format('TypeA', 15, 19)
        findings_tags = '\n'.join([tp_tag, fp_tag])
        golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'findings_record_id': '111-1',
            'findings_xml': xml_template.format(findings_tags),
            'golden_xml': xml_template.format(golden_tags)
        }]

        types_to_ignore = ['ignore']
        # These features are tested in testE2eGCS.
        input_pattern, golden_dir, results_dir, per_note_table, debug_table = (
            None, None, None, None, None)
        mae_input_query = 'SELECT * from [project.dataset.table]'
        mae_golden_table = 'project.dataset.golden_table'
        run_pipeline_lib.run_pipeline(input_pattern,
                                      golden_dir,
                                      results_dir,
                                      mae_input_query,
                                      mae_golden_table,
                                      False,
                                      'results_table',
                                      per_note_table,
                                      debug_table,
                                      types_to_ignore,
                                      pipeline_args=None)

        # Check that we generated the query correctly.
        mock_bq_source_fn.assert_called_with(
            query=('SELECT findings.record_id, findings.xml, golden.xml FROM '
                   '(SELECT * from [project.dataset.table]) AS findings '
                   'LEFT JOIN [project.dataset.golden_table] AS golden '
                   'ON findings.record_id=golden.record_id'))

        # Check we wrote the correct results to BigQuery.
        expected_results = [{
            'info_type': 'ALL',
            'recall': 0.333333,
            'precision': 0.5,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeA',
            'recall': 0.333333,
            'precision': 0.5,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }]
        for r in expected_results:
            r.update({'timestamp': now})
        actual_results = sorted(beam_testutil.get_table('results_table'),
                                key=lambda x: x['info_type'])
        self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                         [normalize_dict_floats(r) for r in actual_results])
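
The expected numbers follow from the standard definitions, using TP=1, FP=1, FN=2 from the tags set up above: precision = TP / (TP + FP), recall = TP / (TP + FN), and F1 = 2 * precision * recall / (precision + recall). A standalone arithmetic check:

# Sanity check of the expected metrics for TP=1, FP=1, FN=2.
tp, fp, fn = 1, 1, 2
precision = tp / float(tp + fp)                           # 0.5
recall = tp / float(tp + fn)                              # 0.333333...
f_score = 2 * precision * recall / (precision + recall)   # 0.4
assert round(precision, 6) == 0.5
assert round(recall, 6) == 0.333333
assert round(f_score, 6) == 0.4
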
Example 4
    def testE2eGCS(self, fake_client_fn, mock_bq_sink_fn, mock_utcnow_fn):
        def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
            return beam_testutil.FakeSink(table_name)

        mock_bq_sink_fn.side_effect = make_sink
        now = 'current time'
        mock_utcnow_fn.return_value = now

        input_pattern = 'gs://bucketname/input/*'
        golden_dir = 'gs://bucketname/goldens'
        results_dir = 'gs://bucketname/results'
        storage_client = testutil.FakeStorageClient()
        fake_client_fn.return_value = storage_client

        tp_tag = tag_template.format('TypeA', 0, 5)
        fp_tag = tag_template.format('TypeA', 8, 10)
        fn_tag = tag_template.format('TypeA', 11, 13)
        fn2_tag = tag_template.format('TypeA', 15, 19)
        findings_tags = '\n'.join([tp_tag, fp_tag])
        golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
        testutil.set_gcs_file('bucketname/input/1-1.xml',
                              xml_template.format(findings_tags))
        testutil.set_gcs_file('bucketname/goldens/1-1.xml',
                              xml_template.format(golden_tags))

        tp2_tag = tag_template.format('TypeB', 20, 21)
        # False negative + false positive for entity matching, but true positive for
        # binary token matching.
        entity_fp_tag = tag_template.format('TypeX', 30, 35)
        entity_fn_tag = tag_template.format('TypeY', 30, 35)
        # Three tokens are tagged as a single entity in the golden. This is not a
        # match for entity matching, but counts as three matches for binary token
        # matching.
        partial_tag1 = tag_template.format('TypeA', 36, 41)
        partial_tag2 = tag_template.format('TypeA', 42, 47)
        partial_tag3 = tag_template.format('TypeA', 48, 54)
        multi_token_tag = tag_template.format('TypeA', 36, 54)
        ignored_tag = tag_template.format('ignore', 55, 57)
        findings_tags = '\n'.join([
            tp_tag, tp2_tag, entity_fp_tag, partial_tag1, partial_tag2,
            partial_tag3, ignored_tag
        ])
        golden_tags = '\n'.join(
            [tp_tag, tp2_tag, entity_fn_tag, multi_token_tag])
        testutil.set_gcs_file('bucketname/input/1-2.xml',
                              xml_template.format(findings_tags))
        testutil.set_gcs_file('bucketname/goldens/1-2.xml',
                              xml_template.format(golden_tags))
        self.old_write_to_text = beam.io.WriteToText
        beam.io.WriteToText = beam_testutil.DummyWriteTransform
        types_to_ignore = ['ignore']
        mae_input_query = None
        mae_golden_table = None
        run_pipeline_lib.run_pipeline(input_pattern,
                                      golden_dir,
                                      results_dir,
                                      mae_input_query,
                                      mae_golden_table,
                                      True,
                                      'results_table',
                                      'per_note_results_table',
                                      'debug_output_table',
                                      types_to_ignore,
                                      pipeline_args=None)
        beam.io.WriteToText = self.old_write_to_text

        # Check we wrote the correct results to BigQuery.
        expected_results = [{
            'info_type': 'ALL',
            'recall': 0.7777777777777778,
            'precision': 0.875,
            'f_score': 0.823529411764706,
            'true_positives': 7,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeA',
            'recall': 0.7142857142857143,
            'precision': 0.8333333333333334,
            'f_score': 0.7692307692307694,
            'true_positives': 5,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeB',
            'recall': 1.0,
            'precision': 1.0,
            'f_score': 1.0,
            'true_positives': 1,
            'false_positives': 0,
            'false_negatives': 0
        }, {
            'info_type': u'TypeY',
            'recall': 1.0,
            'precision': 1.0,
            'f_score': 1.0,
            'true_positives': 1,
            'false_positives': 0,
            'false_negatives': 0
        }]
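        # How the 'ALL' row tallies up under binary token matching:
        #   note 1-1: 1 true positive, 1 false positive, 2 false negatives.
        #   note 1-2: 6 true positives ('word1', '5', 'word7', and the three
        #             tokens of the multi-token entity), 0 FP, 0 FN; the
        #             TypeX/TypeY mismatch still matches at the token level,
        #             and the 'ignore' tag is dropped via types_to_ignore.
        #   total:    TP=7, FP=1, FN=2, giving precision 7/8 and recall 7/9.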
        for r in expected_results:
            r.update({'timestamp': now})
        actual_results = sorted(beam_testutil.get_table('results_table'),
                                key=lambda x: x['info_type'])
        self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                         [normalize_dict_floats(r) for r in actual_results])

        full_text = 'word1   w2 w3  wrd4 5 word6   word7 multi token entity w8'

        def debug_info(record_id, classification, text, info_type, start, end):
            location = full_text.find(text)
            context = (full_text[0:location] + '{[--' + text + '--]}' +
                       full_text[location + len(text):])
            return {
                'record_id': record_id,
                'classification': classification,
                'text': text,
                'info_type': info_type,
                'context': context,
                'start': start,
                'end': end
            }

        expected_debug_info = [
            debug_info('1-1', 'true_positive', 'word1', 'TypeA', 0, 5),
            debug_info('1-1', 'false_positive', 'w2', 'TypeA', 8, 10),
            debug_info('1-1', 'false_negative', 'w3', 'TypeA', 11, 13),
            debug_info('1-1', 'false_negative', 'wrd4', 'TypeA', 15, 19),
            debug_info('1-2', 'true_positive', 'word1', 'TypeA', 0, 5),
            debug_info('1-2', 'true_positive', '5', 'TypeB', 20, 21),
            debug_info('1-2', 'true_positive', 'word7', 'TypeY', 30, 35),
            debug_info('1-2', 'true_positive', 'multi', 'TypeA', 36, 41),
            debug_info('1-2', 'true_positive', 'token', 'TypeA', 42, 47),
            debug_info('1-2', 'true_positive', 'entity', 'TypeA', 48, 54),
        ]
        for r in expected_debug_info:
            r.update({'timestamp': now})

        def s(l):
            return sorted(l, key=lambda x: x['record_id'] + x['context'])

        self.assertEqual(s(expected_debug_info),
                         s(beam_testutil.get_table('debug_output_table')))

        expected_per_note = [{
            'record_id': '1-1',
            'precision': 0.5,
            'recall': 0.3333333333333333,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'record_id': '1-2',
            'precision': 1.0,
            'recall': 1.0,
            'f_score': 1.0,
            'true_positives': 6,
            'false_positives': 0,
            'false_negatives': 0
        }]
        for r in expected_per_note:
            r.update({'timestamp': now})
        actual_results = sorted(
            beam_testutil.get_table('per_note_results_table'),
            key=lambda x: x['record_id'])
        self.assertEqual([normalize_dict_floats(r) for r in expected_per_note],
                         [normalize_dict_floats(r) for r in actual_results])

        # Check we wrote the correct results to GCS.
        expected_text = ''
        with open(os.path.join(TESTDATA_DIR, 'expected_results')) as f:
            expected_text = f.read()
        expected_results = results_pb2.Results()
        text_format.Merge(expected_text, expected_results)
        results = results_pb2.Results()
        text_format.Merge(
            testutil.get_gcs_file('bucketname/results/aggregate_results.txt'),
            results)
        self.assertEqual(normalize_floats(expected_results),
                         normalize_floats(results))

        # Check the per-file results were written correctly.
        expected_result1 = results_pb2.IndividualResult()
        text_format.Merge(
            """
record_id: "1-1"
stats {
  true_positives: 1
  false_positives: 1
  false_negatives: 2
  precision: 0.5
  recall: 0.333333333333
  f_score: 0.4
}""", expected_result1)
        expected_result2 = results_pb2.IndividualResult()
        text_format.Merge(
            """
record_id: "1-2"
stats {
  true_positives: 6
  precision: 1.0
  recall: 1.0
  f_score: 1.0
}""", expected_result2)
        normalize_floats(expected_result1)
        normalize_floats(expected_result2)
        full_text = testutil.get_gcs_file(
            'bucketname/results/per-note-results')
        actual_results = []
        for record in sorted(full_text.split('\n\n')):
            if not record:
                continue
            actual_result = results_pb2.IndividualResult()
            text_format.Merge(record, actual_result)
            actual_results.append(normalize_floats(actual_result))

        self.assertEqual([expected_result1, expected_result2], actual_results)
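
normalize_floats and normalize_dict_floats are helpers from this test module and are not shown here; the comparisons above only need floats rounded to a common precision (e.g. 0.333333333333 in the proto text versus the computed 1/3). A minimal sketch of what the dict variant could look like, purely as an illustration and not the module's actual implementation:

# Illustrative sketch only; the real normalize_dict_floats is not shown in
# this snippet and may differ.
def normalize_dict_floats_sketch(d, places=6):
    return {k: (round(v, places) if isinstance(v, float) else v)
            for k, v in d.items()}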