Example 1
0
    def testGenerateDtdGcs(self):
        """The pipeline writes classification.dtd into a GCS dtd_dir."""
        config_path = os.path.join(TESTDATA_DIR, 'testdata/config.json')
        gcs_dtd_dir = 'gs://dtd-dir'
        # The first six positional args (query/table/MAE destinations) are
        # unused in this scenario, so collapse them into a prefix of Nones.
        positional = [None] * 6 + [
            config_path,
            'InspectPhiTask',
            'fake-credentials',
            'project',
            testutil.FakeStorageClient,
            None,
            None,
            'dlp',
        ]
        run_deid_lib.run_pipeline(*positional,
                                  batch_size=1,
                                  dtd_dir=gcs_dtd_dir,
                                  pipeline_args=None)

        # The generated DTD in fake GCS must match the golden sample.
        expected_path = os.path.join(TESTDATA_DIR, 'mae_testdata',
                                     'sample.dtd')
        with open(expected_path) as expected_dtd:
            self.assertEqual(
                testutil.get_gcs_file('dtd-dir/classification.dtd'),
                expected_dtd.read())
Example 2
0
    def testGenerateDtdLocal(self):
        """The pipeline writes classification.dtd into a local dtd_dir."""
        config_path = os.path.join(TESTDATA_DIR, 'testdata/config.json')
        local_dtd_dir = tempfile.mkdtemp()
        # The first six positional args (query/table/MAE destinations) are
        # unused in this scenario, so collapse them into a prefix of Nones.
        positional = [None] * 6 + [
            config_path,
            'InspectPhiTask',
            'fake-credentials',
            'project',
            testutil.FakeStorageClient,
            None,
            None,
            'dlp',
        ]
        run_deid_lib.run_pipeline(*positional,
                                  batch_size=1,
                                  dtd_dir=local_dtd_dir,
                                  pipeline_args=None)

        # The locally generated DTD must match the golden sample.
        expected_path = os.path.join(TESTDATA_DIR, 'mae_testdata',
                                     'sample.dtd')
        generated_path = os.path.join(local_dtd_dir, 'classification.dtd')
        with open(expected_path) as expected_dtd:
            with open(generated_path) as generated_dtd:
                self.assertEqual(generated_dtd.read(), expected_dtd.read())
Example 3
0
    def testCSV(self, mock_w2t_fn, mock_build_fn):
        """CSV input/output path: one deid response lands in 'output-csv'."""
        # side_effect only needs a callable; wrapping the bound method in
        # partial() with no bound arguments was redundant.
        mock_w2t_fn.side_effect = self.make_csv_output

        # Canned DLP deidentify response: one row with a single 'note' column.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [{
                            'stringValue': 'deid_resp_val'
                        }]
                    }],
                    'headers': [{
                        'name': 'note'
                    }]
                }
            }
        }
        # Wire the fake into the discovery-style client chain:
        # dlp.projects().content().deidentify().execute() -> deid_response.
        fake_content = Mock()
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        deid_cfg = os.path.join(TESTDATA_DIR, 'sample_deid_config.json')
        deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg)
        input_csv = os.path.join(TESTDATA_DIR, 'testdata/input.csv')
        run_deid_lib.run_pipeline(None,
                                  None,
                                  None,
                                  None,
                                  None,
                                  None,
                                  deid_cfg_json,
                                  'InspectPhiTask',
                                  'project',
                                  testutil.FakeStorageClient,
                                  None,
                                  None,
                                  'dlp',
                                  batch_size=1,
                                  dtd_dir=None,
                                  input_csv=input_csv,
                                  output_csv='output-csv',
                                  timestamp=DEID_TIMESTAMP,
                                  pipeline_args=None)

        # Exactly one deidentify request, and the output row carries the
        # deidentified note plus the run timestamp.
        fake_content.deidentify.assert_called_once()
        self.assertEqual(
            testutil.get_gcs_file('output-csv').strip(),
            '222,1,deid_resp_val,' + TIMESTAMP_STRING)
Example 4
0
def main():
    """Parse CLI args and run the DLP DeID pipeline.

    Returns:
        0 on success, 1 if the pipeline reported errors.

    Raises:
        Exception: if service-account credentials or the DeID config file
            are missing.
    """
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser(
        description='Run Data Loss Prevention (DLP) DeID on Google Cloud.')
    run_deid_lib.add_all_args(parser)
    args, pipeline_args = parser.parse_known_args(sys.argv[1:])

    # Credentials must be supplied explicitly via the standard env var.
    var = 'GOOGLE_APPLICATION_CREDENTIALS'
    if not os.environ.get(var):
        raise Exception('You must specify service account credentials in the '
                        'GOOGLE_APPLICATION_CREDENTIALS environment variable.')
    _, default_project = google.auth.default()

    # Parse --project and re-add it to the pipeline args, swapping it out for the
    # default if it's not set.
    project = args.project
    if not project:
        project = default_project
    pipeline_args += ['--project', project]

    bq_client = bigquery.Client(project)
    # Older google-cloud-bigquery releases lack QueryJobConfig; pass None then.
    bq_config_fn = None
    if hasattr(bigquery.job, 'QueryJobConfig'):
        bq_config_fn = bigquery.job.QueryJobConfig

    if not args.deid_config_file:
        raise Exception('Must provide DeID Config.')
    deid_config_json = run_deid_lib.parse_config_file(args.deid_config_file)
    # NOTE(review): utcnow() yields a naive datetime — assumes downstream
    # timestamp formatting expects naive UTC; confirm before changing.
    timestamp = datetime.utcnow()

    errors = run_deid_lib.run_pipeline(
        args.input_query, args.input_table, args.deid_table,
        args.findings_table, args.mae_dir, args.mae_table, deid_config_json,
        args.mae_task_name, project, storage.Client, bq_client, bq_config_fn,
        args.dlp_api_name, args.batch_size, args.dtd_dir, args.input_csv,
        args.output_csv, timestamp, pipeline_args)

    if errors:
        logging.error(errors)
        return 1

    logging.info('Ran DLP API DeID.')
    # Explicit success exit code (previously implicit None, which sys.exit
    # also treats as 0 — this just makes the contract visible).
    return 0
Example 5
0
def deidentify():
  """Run the DLP deid pipeline.

  GET lists previously-created deid jobs. Any other method validates the
  JSON payload, resolves input/output locations, launches the pipeline
  synchronously, and records the result status on the job row.
  """
  if flask.request.method == 'GET':
    jobs, offset = model.get_list(model.DeidJobTable)
    result = [{
        'id': job['id'],
        'name': job['name'],
        'originalQuery': job['original_query'],
        'deidTable': job['deid_table'],
        'status': job['status'],
        'logTrace': job['log_trace'],
        'timestamp': job['timestamp'],
    } for job in jobs]
    return flask.jsonify(jobs=result, offset=offset), 200

  try:
    jsonschema.validate(flask.request.json, deid_schema)
  except jsonschema.ValidationError:
    error_msg = 'unable to validate provided payload.'
    return flask.jsonify(error=400, text=error_msg), 400

  job_data = {
      'name': flask.request.json['name'],
      'timestamp': datetime.utcnow(),
  }
  # NOTE(review): mae_task_name and dtd_dir are never populated from the
  # request below, so run_pipeline always receives None for them — confirm
  # that is intentional.
  (input_query, input_table, deid_table, findings_table, mae_dir, mae_table,
   mae_task_name, batch_size, dtd_dir, input_csv, output_csv) = (
       None, None, None, None, None,
       None, None, None, None, None, None)

  request = flask.request
  # determine input
  input_method, input_info = (request.json['inputMethod'],
                              request.json['inputInfo'])
  if input_method == 'input_table':
    input_table = input_info
    try:
      dataset, table = input_table.split('.')
      if not verify_bq_table(dataset, table, EXPECTED_CSV_SCHEMA):
        error_msg = ('input table schema does not match the expected one. '
                     'Expecting: {}'.format(', '.join(EXPECTED_CSV_SCHEMA)))
        return flask.jsonify(error=400, text=error_msg), 400
    except exceptions.NotFound:
      return flask.jsonify(error=400, text='unable to locate input data'), 400
    job_data['original_query'] = 'SELECT * FROM {}'.format(input_table)
  elif input_method == 'input_query':
    input_query = input_info
    job_data['original_query'] = input_query
    try:
      # Dry-run the query to surface syntax errors before launching.
      get_bq_rows(input_query)
    except exceptions.BadRequest:
      error_msg = 'invalid input query'
      return flask.jsonify(error=400, text=error_msg), 400
  elif input_method == 'input_csv':
    input_csv = input_info
  else:
    error_msg = 'wrong input method provided'
    return flask.jsonify(error=400, text=error_msg), 400

  # Determine output
  output_method, output_info = (request.json['outputMethod'],
                                request.json['outputInfo'])
  job_data['deid_table'] = output_info
  if output_method == 'deid_table':
    deid_table = output_info
    dataset, table = deid_table.split('.')
    try:
      if not verify_bq_table(dataset, table, EXPECTED_OUTPUT_SCHEMA):
        error_msg = ('output table schema does not match the expected one. '
                     'Expecting: {}'.format(', '.join(EXPECTED_OUTPUT_SCHEMA)))
        return flask.jsonify(error=400, text=error_msg), 400
    except exceptions.NotFound:
      # if table not found, a new one will be created
      pass
  elif output_method == 'output_csv':
    output_csv = output_info
  else:
    error_msg = 'wrong output method provided'
    return flask.jsonify(error=400, text=error_msg), 400

  deid_config_json = run_deid_lib.parse_config_file(
      app.config['DEID_CONFIG_FILE'])

  findings_table = request.json.get('findingsTable')
  job_data['findings_table'] = findings_table
  # 'findingsTable' is optional (fetched with .get()): guard before
  # .split(), which previously raised an uncaught AttributeError on None —
  # the except clause below only handles NotFound.
  if findings_table:
    try:
      dataset, table = findings_table.split('.')
      if not verify_bq_table(dataset, table, EXPECTED_FINDINGS_SCHEMA):
        error_msg = (
            'findings table schema does not match the expected one. '
            'Expecting: {}'.format(', '.join(EXPECTED_FINDINGS_SCHEMA)))
        return flask.jsonify(error=400, text=error_msg), 400
    except exceptions.NotFound:
      # if table not found, a new one will be created
      pass

  mae_table = request.json.get('maeTable')
  mae_dir = request.json.get('maeDir')
  batch_size = request.json.get('batchSize') or 1

  pipeline_args = ['--project', app.config['PROJECT_ID']]

  deid_job = model.create(model.DeidJobTable, job_data)
  errors = run_deid_lib.run_pipeline(
      input_query, input_table, deid_table, findings_table, mae_dir, mae_table,
      deid_config_json, mae_task_name, app.config['PROJECT_ID'], storage.Client,
      bq_client, bigquery.job.QueryJobConfig, app.config['DLP_API_NAME'],
      batch_size, dtd_dir, input_csv, output_csv, deid_job.timestamp,
      pipeline_args)

  if errors:
    # Record the failure on the job row so the UI can surface the trace.
    deid_job.update(status=400, log_trace=errors)
    return flask.jsonify(error=400, text=errors), 400

  deid_job.update(status=200)
  return flask.jsonify(result='success'), 200
Example 6
0
    def testReBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_build_fn):
        """Deid falls back to smaller batches after a DLP 400 error.

        The first deidentify call raises "Too many findings" and the first
        inspect response is truncated; the fixtures then supply one response
        per row, so the pipeline is expected to retry per row (batch_size=2
        down to 1) and still produce complete deid-table and MAE output.
        """
        mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)

        # Two input records served by the fake BigQuery/Beam source.
        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }, {
            'first_name': 'Zephod',
            'last_name': 'Beeblebrox',
            'note': 'note2 text',
            'patient_id': '222',
            'record_number': '2'
        }]

        # Per-row deidentify responses consumed after the batched call fails.
        deid_response1 = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Boaty'),
                            sval('McBoatface'),
                            sval('note1 redacted'),
                            sval('111'),
                            sval('1')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }
        deid_response2 = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Zephod'),
                            sval('Beeblebrox'),
                            sval('note2 redacted'),
                            sval('222'),
                            sval('2')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }

        # Per-row inspect findings; codepointRange offsets presumably index
        # into the note text — confirm against run_deid_lib.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings1 = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }]
        }
        findings2 = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        # First inspect result is truncated; the pipeline must then consume
        # one inspect response per row.
        inspect_response_truncated = {'result': {'findingsTruncated': 'True'}}
        inspect_responses = [
            inspect_response_truncated, {
                'result': findings1
            }, {
                'result': findings2
            }
        ]

        # Stateful fake: each call returns the next canned inspect response.
        def inspect_execute():
            response = inspect_responses[inspect_execute.call_count]
            inspect_execute.call_count += 1
            return response

        inspect_execute.call_count = 0
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=inspect_execute)

        # First deidentify call raises HttpError(400); later calls succeed.
        deid_responses = ['Exception', deid_response1, deid_response2]

        def deid_execute():
            response = deid_responses[deid_execute.call_count]
            deid_execute.call_count += 1
            if response == 'Exception':
                content = (
                    '{"error": {"message": "Too many findings to de-identify. '
                    'Retry with a smaller request."}}')
                raise errors.HttpError(httplib2.Response({'status': 400}),
                                       content)
            return response

        deid_execute.call_count = 0
        fake_content.deidentify.return_value = Mock(execute=deid_execute)

        # Wire the fakes into the discovery-style client chain:
        # dlp.projects().content().{inspect,deidentify}().execute().
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        # Fake BigQuery client serving the same two rows for the query path.
        query_job = Mock()
        rows = [[
            'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'
        ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/batch_config.json')

        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=2,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # The first (failed) call must have carried the full batched request
        # body from the golden file.
        expected_request_body = {}
        with open(os.path.join(TESTDATA_DIR,
                               'testdata/batch_request.json')) as f:
            expected_request_body = json.load(f)
        fake_content.deidentify.assert_called()
        _, kwargs = fake_content.deidentify.call_args_list[0]
        self.maxDiff = 10000
        self.assertEqual(ordered(expected_request_body),
                         ordered(kwargs['body']))

        # Despite the initial failure, both rows reach the deid table and
        # the per-record MAE XML files exist in fake GCS.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         EXPECTED_DEID_RESULT)
        self.assertEqual(EXPECTED_MAE1,
                         testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
        self.assertEqual(EXPECTED_MAE2,
                         testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
Example 7
0
    def testBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
        """Two rows are deidentified in a single batched DLP request.

        With batch_size=2 the pipeline must issue exactly one deidentify
        call containing both rows, whose request body matches the golden
        batch_request.json, and route results to the deid table and MAE dir.
        """
        mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)

        # Two input records served by the fake BigQuery/Beam source.
        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }, {
            'first_name': 'Zephod',
            'last_name': 'Beeblebrox',
            'note': 'note2 text',
            'patient_id': '222',
            'record_number': '2'
        }]

        # Single batched deidentify response covering both rows.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Boaty'),
                            sval('McBoatface'),
                            sval('note1 redacted'),
                            sval('111'),
                            sval('1')
                        ]
                    }, {
                        'values': [
                            sval('Zephod'),
                            sval('Beeblebrox'),
                            sval('note2 redacted'),
                            sval('222'),
                            sval('2')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }
        # Batched inspect findings; rowIndex attributes map each finding
        # back to its source row within the batch.
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '0'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '1'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the discovery-style client chain:
        # dlp.projects().content().{inspect,deidentify}().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        # Fake BigQuery client serving the same two rows for the query path.
        query_job = Mock()
        rows = [[
            'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'
        ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/batch_config.json')

        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=2,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # Exactly one deidentify call whose body matches the golden request.
        expected_request_body = {}
        with open(os.path.join(TESTDATA_DIR,
                               'testdata/batch_request.json')) as f:
            expected_request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(expected_request_body),
                         ordered(kwargs['body']))

        # Both rows land in the deid table and per-record MAE XML in GCS.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         EXPECTED_DEID_RESULT)
        self.assertEqual(EXPECTED_MAE1,
                         testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
        self.assertEqual(EXPECTED_MAE2,
                         testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
Example 8
0
    def testMultiColumnDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                            mock_build_fn):
        """Multi-column config: both 'note' and 'last_name' are deidentified.

        Uses multi_column_config.json; the deid table schema gains a
        last_name column, and MAE output is disabled (not supported for
        multi-column runs).
        """
        # Extend the sink schema so the deid table accepts last_name too.
        table_to_schema = _TABLE_TO_SCHEMA.copy()
        table_to_schema['deid_tbl'] += ', last_name:STRING'
        mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)

        # One input record served by the fake BigQuery/Beam source.
        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }]

        # Deidentify response carries both transformed columns.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [{
                            'stringValue': 'deidtext'
                        }, {
                            'stringValue': 'myname'
                        }]
                    }],
                    'headers': [{
                        'name': 'note'
                    }, {
                        'name': 'last_name'
                    }]
                }
            }
        }
        # Findings mix empty table locations with an explicit rowIndex.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_CENSUS_NAME'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '0'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the discovery-style client chain:
        # dlp.projects().content().{inspect,deidentify}().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        # Fake BigQuery client for the query path.
        query_job = Mock()
        rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/multi_column_config.json')

        mae_dir = ''  # Not compatible with multi-column.
        mae_table = ''  # Not compatible with multi-column.
        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  mae_dir,
                                  mae_table,
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=1,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # The single deidentify call must match the golden request body.
        request_body = {}
        with open(
                os.path.join(TESTDATA_DIR,
                             'testdata/multi_column_request.json')) as f:
            request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(request_body), ordered(kwargs['body']))

        # Both transformed columns appear in the written deid row.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'note': 'deidtext',
                             'last_name': 'myname'
                         }])
Example 9
0
    def testE2E(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
        """End-to-end run: deid table, findings table, MAE XML/DTD outputs.

        Exercises a field transform column alongside the note deid, checks
        the request body against testdata/request.json, and verifies MAE
        artifacts are written both to fake GCS and a local dtd_dir.
        """
        # Extend the sink schema with the field-transform column.
        table_to_schema = _TABLE_TO_SCHEMA.copy()
        table_to_schema['deid_tbl'] += ', field_transform_col:STRING'
        mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)

        # One input record served by the fake BigQuery/Beam source.
        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name':
            'Boaty',
            'last_name':
            'McBoatface',
            'note':
            'text and PID and MORE PID',
            'patient_id':
            '111',
            'record_number':
            '1',
            'field_transform_col':
            'transform me!'
        }]

        # Deidentify response covers the note and the transformed field.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [{
                            'stringValue': 'deid_resp_val'
                        }, {
                            'stringValue': 'transformed!!'
                        }]
                    }],
                    'headers': [{
                        'name': 'note'
                    }, {
                        'name': 'field_transform_col'
                    }]
                }
            }
        }
        # Inspect findings; these also feed the findings_tbl assertion below.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_CENSUS_NAME'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the discovery-style client chain:
        # dlp.projects().content().{inspect,deidentify}().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        # Fake BigQuery client for the query path.
        query_job = Mock()
        rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')
        dtd_dir = tempfile.mkdtemp()
        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=1,
                                  dtd_dir=dtd_dir,
                                  pipeline_args=None)

        # The single deidentify call must match the golden request body.
        request_body = {}
        with open(os.path.join(TESTDATA_DIR, 'testdata/request.json')) as f:
            request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(request_body), ordered(kwargs['body']))

        # MAE XML is written to GCS and mirrored into the mae_tbl table.
        with open(os.path.join(TESTDATA_DIR, 'mae_testdata',
                               'sample.xml')) as f:
            contents = f.read()
            self.assertEqual(
                testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'),
                contents)
            self.assertEqual(beam_testutil.get_table('mae_tbl'),
                             [{
                                 'record_id': '111-1',
                                 'xml': contents
                             }])
        # The DTD is written both to the GCS MAE dir and the local dtd_dir.
        with open(os.path.join(TESTDATA_DIR, 'mae_testdata',
                               'sample.dtd')) as f:
            contents = f.read()
            self.assertEqual(
                testutil.get_gcs_file('mae-bucket/mae-dir/classification.dtd'),
                contents)
            with open(os.path.join(dtd_dir,
                                   'classification.dtd')) as local_dtd:
                self.assertEqual(local_dtd.read(), contents)

        # Deid table row carries the transformed values; findings table
        # stores the stringified findings dict.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'note': 'deid_resp_val',
                             'field_transform_col': 'transformed!!'
                         }])
        self.assertEqual(beam_testutil.get_table('findings_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'findings': str(findings)
                         }])