def testE2eBigquery(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_utcnow_fn):
        """End-to-end eval pipeline test reading findings/goldens from BigQuery.

        Feeds one record (findings XML vs. golden XML) through a faked
        BigQuery source, runs run_pipeline with a MAE input query and a
        golden table, then verifies both the generated join query and the
        per-info-type results written to 'results_table'.

        NOTE(review): the mock_* parameters are injected by patch decorators
        outside this chunk — verify their order against the decorators.
        """
        def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
            # Capture every BigQuery write in an in-memory fake keyed by table.
            return beam_testutil.FakeSink(table_name)

        mock_bq_sink_fn.side_effect = make_sink
        now = 'current time'
        mock_utcnow_fn.return_value = now

        # One true positive, one false positive, two false negatives, all of
        # info type 'TypeA'; tag args are (type, start offset, end offset).
        tp_tag = tag_template.format('TypeA', 0, 5)
        fp_tag = tag_template.format('TypeA', 8, 10)
        fn_tag = tag_template.format('TypeA', 11, 13)
        fn2_tag = tag_template.format('TypeA', 15, 19)
        findings_tags = '\n'.join([tp_tag, fp_tag])
        golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'findings_record_id':
            '111-1',
            'findings_xml':
            xml_template.format(findings_tags),
            'golden_xml':
            xml_template.format(golden_tags)
        }]

        types_to_ignore = ['ignore']
        # These features are tested in testE2eGCS.
        input_pattern, golden_dir, results_dir, per_note_table, debug_table = (
            None, None, None, None, None)
        mae_input_query = 'SELECT * from [project.dataset.table]'
        mae_golden_table = 'project.dataset.golden_table'
        run_pipeline_lib.run_pipeline(input_pattern,
                                      golden_dir,
                                      results_dir,
                                      mae_input_query,
                                      mae_golden_table,
                                      False,
                                      'results_table',
                                      per_note_table,
                                      debug_table,
                                      types_to_ignore,
                                      pipeline_args=None)

        # Check that we generated the query correctly.
        mock_bq_source_fn.assert_called_with(
            query=('SELECT findings.record_id, findings.xml, golden.xml FROM '
                   '(SELECT * from [project.dataset.table]) AS findings '
                   'LEFT JOIN [project.dataset.golden_table] AS golden '
                   'ON findings.record_id=golden.record_id'))

        # Check we wrote the correct results to BigQuery.
        # 1 TP, 1 FP, 2 FN => precision 0.5, recall 1/3, f-score 0.4.
        expected_results = [{
            'info_type': 'ALL',
            'recall': 0.333333,
            'precision': 0.5,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeA',
            'recall': 0.333333,
            'precision': 0.5,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }]
        # Every results row should be stamped with the mocked current time.
        for r in expected_results:
            r.update({'timestamp': now})
        actual_results = sorted(beam_testutil.get_table('results_table'),
                                key=lambda x: x['info_type'])
        # Compare with floats normalized to tolerate rounding differences.
        self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                         [normalize_dict_floats(r) for r in actual_results])
    def testE2eGCS(self, fake_client_fn, mock_bq_sink_fn, mock_utcnow_fn):
        def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
            return beam_testutil.FakeSink(table_name)

        mock_bq_sink_fn.side_effect = make_sink
        now = 'current time'
        mock_utcnow_fn.return_value = now

        input_pattern = 'gs://bucketname/input/*'
        golden_dir = 'gs://bucketname/goldens'
        results_dir = 'gs://bucketname/results'
        storage_client = testutil.FakeStorageClient()
        fake_client_fn.return_value = storage_client

        tp_tag = tag_template.format('TypeA', 0, 5)
        fp_tag = tag_template.format('TypeA', 8, 10)
        fn_tag = tag_template.format('TypeA', 11, 13)
        fn2_tag = tag_template.format('TypeA', 15, 19)
        findings_tags = '\n'.join([tp_tag, fp_tag])
        golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
        testutil.set_gcs_file('bucketname/input/1-1.xml',
                              xml_template.format(findings_tags))
        testutil.set_gcs_file('bucketname/goldens/1-1.xml',
                              xml_template.format(golden_tags))

        tp2_tag = tag_template.format('TypeB', 20, 21)
        # False negative + false positive for entity matching, but true positive for
        # binary token matching.
        entity_fp_tag = tag_template.format('TypeX', 30, 35)
        entity_fn_tag = tag_template.format('TypeY', 30, 35)
        # Two tokens are tagged as one in the golden. This is not a match for entity
        # matching, but is two matches for binary token matching.
        partial_tag1 = tag_template.format('TypeA', 36, 41)
        partial_tag2 = tag_template.format('TypeA', 42, 47)
        partial_tag3 = tag_template.format('TypeA', 48, 54)
        multi_token_tag = tag_template.format('TypeA', 36, 54)
        ignored_tag = tag_template.format('ignore', 55, 57)
        findings_tags = '\n'.join([
            tp_tag, tp2_tag, entity_fp_tag, partial_tag1, partial_tag2,
            partial_tag3, ignored_tag
        ])
        golden_tags = '\n'.join(
            [tp_tag, tp2_tag, entity_fn_tag, multi_token_tag])
        testutil.set_gcs_file('bucketname/input/1-2.xml',
                              xml_template.format(findings_tags))
        testutil.set_gcs_file('bucketname/goldens/1-2.xml',
                              xml_template.format(golden_tags))
        self.old_write_to_text = beam.io.WriteToText
        beam.io.WriteToText = beam_testutil.DummyWriteTransform
        types_to_ignore = ['ignore']
        mae_input_query = None
        mae_golden_table = None
        run_pipeline_lib.run_pipeline(input_pattern,
                                      golden_dir,
                                      results_dir,
                                      mae_input_query,
                                      mae_golden_table,
                                      True,
                                      'results_table',
                                      'per_note_results_table',
                                      'debug_output_table',
                                      types_to_ignore,
                                      pipeline_args=None)
        beam.io.WriteToText = self.old_write_to_text

        # Check we wrote the correct results to BigQuery.
        expected_results = [{
            'info_type': 'ALL',
            'recall': 0.7777777777777778,
            'precision': 0.875,
            'f_score': 0.823529411764706,
            'true_positives': 7,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeA',
            'recall': 0.7142857142857143,
            'precision': 0.8333333333333334,
            'f_score': 0.7692307692307694,
            'true_positives': 5,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'info_type': u'TypeB',
            'recall': 1.0,
            'precision': 1.0,
            'f_score': 1.0,
            'true_positives': 1,
            'false_positives': 0,
            'false_negatives': 0
        }, {
            'info_type': u'TypeY',
            'recall': 1.0,
            'precision': 1.0,
            'f_score': 1.0,
            'true_positives': 1,
            'false_positives': 0,
            'false_negatives': 0
        }]
        for r in expected_results:
            r.update({'timestamp': now})
        actual_results = sorted(beam_testutil.get_table('results_table'),
                                key=lambda x: x['info_type'])
        self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                         [normalize_dict_floats(r) for r in actual_results])

        full_text = 'word1   w2 w3  wrd4 5 word6   word7 multi token entity w8'

        def debug_info(record_id, classification, text, info_type, start, end):
            location = full_text.find(text)
            context = (full_text[0:location] + '{[--' + text + '--]}' +
                       full_text[location + len(text):])
            return {
                'record_id': record_id,
                'classification': classification,
                'text': text,
                'info_type': info_type,
                'context': context,
                'start': start,
                'end': end
            }

        expected_debug_info = [
            debug_info('1-1', 'true_positive', 'word1', 'TypeA', 0, 5),
            debug_info('1-1', 'false_positive', 'w2', 'TypeA', 8, 10),
            debug_info('1-1', 'false_negative', 'w3', 'TypeA', 11, 13),
            debug_info('1-1', 'false_negative', 'wrd4', 'TypeA', 15, 19),
            debug_info('1-2', 'true_positive', 'word1', 'TypeA', 0, 5),
            debug_info('1-2', 'true_positive', '5', 'TypeB', 20, 21),
            debug_info('1-2', 'true_positive', 'word7', 'TypeY', 30, 35),
            debug_info('1-2', 'true_positive', 'multi', 'TypeA', 36, 41),
            debug_info('1-2', 'true_positive', 'token', 'TypeA', 42, 47),
            debug_info('1-2', 'true_positive', 'entity', 'TypeA', 48, 54),
        ]
        for r in expected_debug_info:
            r.update({'timestamp': now})

        def s(l):
            return sorted(l, key=lambda x: x['record_id'] + x['context'])

        self.assertEqual(s(expected_debug_info),
                         s(beam_testutil.get_table('debug_output_table')))

        expected_per_note = [{
            'record_id': '1-1',
            'precision': 0.5,
            'recall': 0.3333333333333333,
            'f_score': 0.4,
            'true_positives': 1,
            'false_positives': 1,
            'false_negatives': 2
        }, {
            'record_id': '1-2',
            'precision': 1.0,
            'recall': 1.0,
            'f_score': 1.0,
            'true_positives': 6,
            'false_positives': 0,
            'false_negatives': 0
        }]
        for r in expected_per_note:
            r.update({'timestamp': now})
        actual_results = sorted(
            beam_testutil.get_table('per_note_results_table'),
            key=lambda x: x['record_id'])
        self.assertEqual([normalize_dict_floats(r) for r in expected_per_note],
                         [normalize_dict_floats(r) for r in actual_results])

        # Check we wrote the correct results to GCS.
        expected_text = ''
        with open(os.path.join(TESTDATA_DIR, 'expected_results')) as f:
            expected_text = f.read()
        expected_results = results_pb2.Results()
        text_format.Merge(expected_text, expected_results)
        results = results_pb2.Results()
        text_format.Merge(
            testutil.get_gcs_file('bucketname/results/aggregate_results.txt'),
            results)
        self.assertEqual(normalize_floats(expected_results),
                         normalize_floats(results))

        # Check the per-file results were written correctly.
        expected_result1 = results_pb2.IndividualResult()
        text_format.Merge(
            """
record_id: "1-1"
stats {
  true_positives: 1
  false_positives: 1
  false_negatives: 2
  precision: 0.5
  recall: 0.333333333333
  f_score: 0.4
}""", expected_result1)
        expected_result2 = results_pb2.IndividualResult()
        text_format.Merge(
            """
record_id: "1-2"
stats {
  true_positives: 6
  precision: 1.0
  recall: 1.0
  f_score: 1.0
}""", expected_result2)
        normalize_floats(expected_result1)
        normalize_floats(expected_result2)
        full_text = testutil.get_gcs_file(
            'bucketname/results/per-note-results')
        actual_results = []
        for record in sorted(full_text.split('\n\n')):
            if not record:
                continue
            actual_result = results_pb2.IndividualResult()
            text_format.Merge(record, actual_result)
            actual_results.append(normalize_floats(actual_result))

        self.assertEqual([expected_result1, expected_result2], actual_results)
# Example 3 (separator artifact from code-sample aggregation; was
# "Esempio n. 3" followed by a stray "0" — commented out so the file parses)
    def testReBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_build_fn):
        """Tests re-batching when DLP rejects the initial batched request.

        Two records are read with batch_size=2. The first inspect response
        reports findingsTruncated and the first deidentify call raises an
        HTTP 400 "Too many findings" error; the scripted responses then
        serve each record individually. Presumably run_pipeline reacts by
        retrying with smaller batches — confirm against run_deid_lib.

        NOTE(review): the mock_* parameters are injected by patch decorators
        outside this chunk — verify their order against the decorators.
        """
        mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }, {
            'first_name': 'Zephod',
            'last_name': 'Beeblebrox',
            'note': 'note2 text',
            'patient_id': '222',
            'record_number': '2'
        }]

        # Per-record deidentify responses served after the failed batch call.
        deid_response1 = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Boaty'),
                            sval('McBoatface'),
                            sval('note1 redacted'),
                            sval('111'),
                            sval('1')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }
        deid_response2 = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Zephod'),
                            sval('Beeblebrox'),
                            sval('note2 redacted'),
                            sval('222'),
                            sval('2')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }

        # Findings without a rowIndex, as returned for single-row requests.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings1 = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }]
        }
        findings2 = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        # First inspect call reports truncation; the next two serve one
        # record each.
        inspect_response_truncated = {'result': {'findingsTruncated': 'True'}}
        inspect_responses = [
            inspect_response_truncated, {
                'result': findings1
            }, {
                'result': findings2
            }
        ]

        def inspect_execute():
            # Serve scripted inspect responses in call order.
            response = inspect_responses[inspect_execute.call_count]
            inspect_execute.call_count += 1
            return response

        inspect_execute.call_count = 0
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=inspect_execute)

        # First deidentify call fails (batch too large); the next two serve
        # one record each.
        deid_responses = ['Exception', deid_response1, deid_response2]

        def deid_execute():
            # Serve scripted deidentify responses in call order; the
            # 'Exception' sentinel raises the DLP "Too many findings" error.
            response = deid_responses[deid_execute.call_count]
            deid_execute.call_count += 1
            if response == 'Exception':
                content = (
                    '{"error": {"message": "Too many findings to de-identify. '
                    'Retry with a smaller request."}}')
                raise errors.HttpError(httplib2.Response({'status': 400}),
                                       content)
            return response

        deid_execute.call_count = 0
        fake_content.deidentify.return_value = Mock(execute=deid_execute)

        # Wire the fakes into the shape of the DLP discovery client:
        # dlp.projects().content().inspect()/deidentify().execute().
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        query_job = Mock()
        rows = [[
            'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'
        ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/batch_config.json')

        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=2,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # The first (batched) request body must match the golden request.
        expected_request_body = {}
        with open(os.path.join(TESTDATA_DIR,
                               'testdata/batch_request.json')) as f:
            expected_request_body = json.load(f)
        fake_content.deidentify.assert_called()
        _, kwargs = fake_content.deidentify.call_args_list[0]
        self.maxDiff = 10000
        self.assertEqual(ordered(expected_request_body),
                         ordered(kwargs['body']))

        # Deid output table and per-record MAE XML files must still be
        # produced despite the failed first batch.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         EXPECTED_DEID_RESULT)
        self.assertEqual(EXPECTED_MAE1,
                         testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
        self.assertEqual(EXPECTED_MAE2,
                         testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
# Example 4 (separator artifact from code-sample aggregation; was
# "Esempio n. 4" followed by a stray "0" — commented out so the file parses)
    def testBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
        """Tests de-identifying two records in a single batched DLP request.

        With batch_size=2, both records go to DLP in one deidentify call
        (asserted via assert_called_once) whose rows/findings carry explicit
        rowIndex locations mapping findings back to records.

        NOTE(review): the mock_* parameters are injected by patch decorators
        outside this chunk — verify their order against the decorators.
        """
        mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }, {
            'first_name': 'Zephod',
            'last_name': 'Beeblebrox',
            'note': 'note2 text',
            'patient_id': '222',
            'record_number': '2'
        }]

        # Single batched deidentify response containing both records' rows.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [
                            sval('Boaty'),
                            sval('McBoatface'),
                            sval('note1 redacted'),
                            sval('111'),
                            sval('1')
                        ]
                    }, {
                        'values': [
                            sval('Zephod'),
                            sval('Beeblebrox'),
                            sval('note2 redacted'),
                            sval('222'),
                            sval('2')
                        ]
                    }],
                    'headers':
                    DEID_HEADERS
                }
            }
        }
        # Inspect findings reference their source record via rowIndex.
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '0'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '1'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the shape of the DLP discovery client:
        # dlp.projects().content().inspect()/deidentify().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        query_job = Mock()
        rows = [[
            'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'
        ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/batch_config.json')

        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=2,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # Exactly one deidentify request, matching the golden request body.
        expected_request_body = {}
        with open(os.path.join(TESTDATA_DIR,
                               'testdata/batch_request.json')) as f:
            expected_request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(expected_request_body),
                         ordered(kwargs['body']))

        # Deid output table and per-record MAE XML files were written.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         EXPECTED_DEID_RESULT)
        self.assertEqual(EXPECTED_MAE1,
                         testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
        self.assertEqual(EXPECTED_MAE2,
                         testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
# Example 5 (separator artifact from code-sample aggregation; was
# "Esempio n. 5" followed by a stray "0" — commented out so the file parses)
    def testMultiColumnDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                            mock_build_fn):
        """Tests de-identifying multiple columns (note + last_name) at once.

        Uses a multi-column config; the deid table schema is extended with
        last_name and the DLP response carries both transformed columns.
        MAE output is disabled (empty dir/table) since it is not compatible
        with multi-column mode.

        NOTE(review): the mock_* parameters are injected by patch decorators
        outside this chunk — verify their order against the decorators.
        """
        table_to_schema = _TABLE_TO_SCHEMA.copy()
        table_to_schema['deid_tbl'] += ', last_name:STRING'
        mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name': 'Boaty',
            'last_name': 'McBoatface',
            'note': 'text and PID and MORE PID',
            'patient_id': '111',
            'record_number': '1'
        }]

        # Response returns both de-identified columns, keyed by header name.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [{
                            'stringValue': 'deidtext'
                        }, {
                            'stringValue': 'myname'
                        }]
                    }],
                    'headers': [{
                        'name': 'note'
                    }, {
                        'name': 'last_name'
                    }]
                }
            }
        }
        # Findings without a rowIndex; one finding does carry rowIndex 0.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_CENSUS_NAME'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': [{
                        'recordLocation': {
                            'tableLocation': {
                                'rowIndex': '0'
                            }
                        }
                    }]
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the shape of the DLP discovery client:
        # dlp.projects().content().inspect()/deidentify().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        query_job = Mock()
        rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg_file = os.path.join(TESTDATA_DIR,
                                     'testdata/multi_column_config.json')

        mae_dir = ''  # Not compatible with multi-column.
        mae_table = ''  # Not compatible with multi-column.
        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  mae_dir,
                                  mae_table,
                                  deid_cfg_file,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=1,
                                  dtd_dir=None,
                                  pipeline_args=None)

        # The single deidentify request must match the golden request body.
        request_body = {}
        with open(
                os.path.join(TESTDATA_DIR,
                             'testdata/multi_column_request.json')) as f:
            request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(request_body), ordered(kwargs['body']))

        # Output row carries both transformed columns.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'note': 'deidtext',
                             'last_name': 'myname'
                         }])
# Example 6 (separator artifact from code-sample aggregation; was
# "Esempio n. 6" followed by a stray "0" — commented out so the file parses)
    def testE2E(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
        """Full end-to-end de-id test covering every pipeline output.

        One record with an extra field-transform column is de-identified;
        verifies the DLP request body, MAE XML and DTD files written to GCS
        (and the DTD copied to a local dtd_dir), the MAE BigQuery table, the
        de-identified output table, and the findings table.

        NOTE(review): the mock_* parameters are injected by patch decorators
        outside this chunk — verify their order against the decorators.
        """
        table_to_schema = _TABLE_TO_SCHEMA.copy()
        table_to_schema['deid_tbl'] += ', field_transform_col:STRING'
        mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)

        mock_bq_source_fn.return_value = beam_testutil.FakeSource()
        mock_bq_source_fn.return_value._records = [{
            'first_name':
            'Boaty',
            'last_name':
            'McBoatface',
            'note':
            'text and PID and MORE PID',
            'patient_id':
            '111',
            'record_number':
            '1',
            'field_transform_col':
            'transform me!'
        }]

        # Response returns the de-identified note plus the field-transformed
        # column, keyed by header name.
        deid_response = {
            'item': {
                'table': {
                    'rows': [{
                        'values': [{
                            'stringValue': 'deid_resp_val'
                        }, {
                            'stringValue': 'transformed!!'
                        }]
                    }],
                    'headers': [{
                        'name': 'note'
                    }, {
                        'name': 'field_transform_col'
                    }]
                }
            }
        }
        # Findings without a rowIndex, as returned for single-row requests.
        empty_locations = [{'recordLocation': {'tableLocation': {}}}]
        findings = {
            'findings': [{
                'location': {
                    'codepointRange': {
                        'start': '17',
                        'end': '25'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'PHONE_NUMBER'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_CENSUS_NAME'
                }
            }, {
                'location': {
                    'codepointRange': {
                        'start': '9',
                        'end': '12'
                    },
                    'contentLocations': empty_locations
                },
                'infoType': {
                    'name': 'US_MALE_NAME'
                }
            }]
        }
        inspect_response = {'result': findings}
        # Wire the fakes into the shape of the DLP discovery client:
        # dlp.projects().content().inspect()/deidentify().execute().
        fake_content = Mock()
        fake_content.inspect.return_value = Mock(execute=Mock(
            return_value=inspect_response))
        fake_content.deidentify.return_value = Mock(execute=Mock(
            return_value=deid_response))
        fake_projects = Mock(content=Mock(return_value=fake_content))
        fake_dlp = Mock(projects=Mock(return_value=fake_projects))
        mock_build_fn.return_value = fake_dlp

        query_job = Mock()
        rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
        results_table = FakeBqResults(bq_schema(), rows)
        query_job.destination.fetch_data.return_value = results_table
        bq_client = Mock()
        bq_client.run_async_query.return_value = query_job

        deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')
        # Local directory where the pipeline drops the generated DTD.
        dtd_dir = tempfile.mkdtemp()
        run_deid_lib.run_pipeline('input_query',
                                  None,
                                  'deid_tbl',
                                  'findings_tbl',
                                  'gs://mae-bucket/mae-dir',
                                  'mae_tbl',
                                  deid_cfg,
                                  'InspectPhiTask',
                                  'fake-credentials',
                                  'project',
                                  testutil.FakeStorageClient,
                                  bq_client,
                                  None,
                                  'dlp',
                                  batch_size=1,
                                  dtd_dir=dtd_dir,
                                  pipeline_args=None)

        # The single deidentify request must match the golden request body.
        request_body = {}
        with open(os.path.join(TESTDATA_DIR, 'testdata/request.json')) as f:
            request_body = json.load(f)
        fake_content.deidentify.assert_called_once()
        _, kwargs = fake_content.deidentify.call_args
        self.maxDiff = 10000
        self.assertEqual(ordered(request_body), ordered(kwargs['body']))

        # MAE XML is written both to GCS and to the MAE BigQuery table.
        with open(os.path.join(TESTDATA_DIR, 'mae_testdata',
                               'sample.xml')) as f:
            contents = f.read()
            self.assertEqual(
                testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'),
                contents)
            self.assertEqual(beam_testutil.get_table('mae_tbl'),
                             [{
                                 'record_id': '111-1',
                                 'xml': contents
                             }])
        # The DTD is written to GCS and also copied into the local dtd_dir.
        with open(os.path.join(TESTDATA_DIR, 'mae_testdata',
                               'sample.dtd')) as f:
            contents = f.read()
            self.assertEqual(
                testutil.get_gcs_file('mae-bucket/mae-dir/classification.dtd'),
                contents)
            with open(os.path.join(dtd_dir,
                                   'classification.dtd')) as local_dtd:
                self.assertEqual(local_dtd.read(), contents)

        # De-identified output table and raw findings table contents.
        self.assertEqual(beam_testutil.get_table('deid_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'note': 'deid_resp_val',
                             'field_transform_col': 'transformed!!'
                         }])
        self.assertEqual(beam_testutil.get_table('findings_tbl'),
                         [{
                             'patient_id': '111',
                             'record_number': '1',
                             'findings': str(findings)
                         }])