def test_heel_analyses(self, mock_hpo_bucket):
    """Run the Achilles Heel analyses end-to-end for a randomized HPO id and
    validate the result tables.

    Checks (1) row counts of the heel-results and derived-results tables
    against expected constants, and (2) that every analysis_id flagged as
    ERROR / WARNING / NOTIFICATION belongs to the known set for that
    category (the heel re-categorization logic).

    Long-running test: loads a dataset and issues several BigQuery jobs.
    """
    mock_hpo_bucket.return_value = self.get_mock_hpo_bucket()
    test_util.get_synpuf_results_files()

    # Create randomized table names to bypass BQ rate limits on
    # repeatedly re-creating tables with the same name.
    random_string = str(randint(10000, 99999))
    randomized_hpo_id = FAKE_HPO_ID + '_' + random_string

    # prepare
    self._load_dataset(randomized_hpo_id)
    test_util.populate_achilles(self.hpo_bucket,
                                hpo_id=randomized_hpo_id,
                                include_heel=False)

    # define tables
    achilles_heel_results = (randomized_hpo_id + '_' +
                             achilles_heel.ACHILLES_HEEL_RESULTS)
    achilles_results_derived = (randomized_hpo_id + '_' +
                                achilles_heel.ACHILLES_RESULTS_DERIVED)

    # run achilles heel
    achilles_heel.create_tables(randomized_hpo_id, True)
    achilles_heel.run_heel(hpo_id=randomized_hpo_id)

    def row_count(table_id):
        # Number of rows currently in table_id.
        query = sql_wrangle.qualify_tables(
            'SELECT COUNT(1) as num_rows FROM %s' % table_id)
        response = bq_utils.query(query)
        rows = bq_utils.response2rows(response)
        return rows[0]['num_rows']

    def queried_analysis_ids(query):
        # analysis_id values returned by the (to-be-qualified) query.
        response = bq_utils.query(sql_wrangle.qualify_tables(query))
        rows = bq_utils.response2rows(response)
        return [row['analysis_id'] for row in rows]

    # validate row counts of the generated tables
    self.assertEqual(ACHILLES_HEEL_RESULTS_COUNT,
                     row_count(achilles_heel_results))
    self.assertEqual(ACHILLES_RESULTS_DERIVED_COUNT,
                     row_count(achilles_results_derived))

    # test new heel re-categorization: every flagged analysis_id must be
    # in the known set for its severity category
    errors = [
        2, 4, 5, 101, 200, 206, 207, 209, 400, 405, 406, 409, 411, 413,
        500, 505, 506, 509, 600, 605, 606, 609, 613, 700, 705, 706, 709,
        711, 713, 715, 716, 717, 800, 805, 806, 809, 813, 814, 906, 1006,
        1609, 1805
    ]
    actual_result = queried_analysis_ids(
        """SELECT analysis_id FROM {table_id}
        WHERE achilles_heel_warning LIKE 'ERROR:%'
        GROUP BY analysis_id""".format(table_id=achilles_heel_results))
    for analysis_id in actual_result:
        self.assertIn(analysis_id, errors)

    warnings = [
        4, 5, 7, 8, 9, 200, 210, 302, 400, 402, 412, 420, 500, 511, 512,
        513, 514, 515, 602, 612, 620, 702, 712, 720, 802, 812, 820
    ]
    actual_result = queried_analysis_ids(
        """SELECT analysis_id FROM {table_id}
        WHERE achilles_heel_warning LIKE 'WARNING:%'
        GROUP BY analysis_id""".format(table_id=achilles_heel_results))
    for analysis_id in actual_result:
        self.assertIn(analysis_id, warnings)

    notifications = [
        101, 103, 105, 114, 115, 118, 208, 301, 410, 610, 710, 810, 900,
        907, 1000, 1800, 1807
    ]
    actual_result = queried_analysis_ids(
        """SELECT analysis_id FROM {table_id}
        WHERE achilles_heel_warning LIKE 'NOTIFICATION:%'
        and analysis_id is not null
        GROUP BY analysis_id""".format(table_id=achilles_heel_results))
    for analysis_id in actual_result:
        self.assertIn(analysis_id, notifications)
def test_union_ehr(self, mock_hpo_info):
    """End-to-end check of ehr_union.main.

    Verifies that after the union: the input dataset is untouched; the
    output dataset contains exactly the expected mapping + CDM tables;
    fact_relationship ids are offset per-HPO; mapping tables have the
    expected schema and row counts; each output CDM table has the
    expected row count (and clustering where person_id exists); and the
    distinct person_ids in the output match the union of the inputs.
    """
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # hpo_info is mocked so the union only sees this test's HPO ids
    mock_hpo_info.return_value = [{
        'hpo_id': hpo_id
    } for hpo_id in self.hpo_ids]

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, [EXCLUDED_HPO_ID])

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # fact_relationship from pitt: ids in the unioned table should be the
    # source ids shifted by this HPO's unique offset
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    # domain_concept_id 21 appears to denote the Measurement domain here
    # (matches the _mapping_measurement join below) — TODO confirm
    q = '''SELECT fact_id_1, fact_id_2
           FROM `{input_dataset}.{hpo_id}_fact_relationship`
           where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset

    q = '''SELECT fr.fact_id_1, fr.fact_id_2
           FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
           join `{dataset_id}._mapping_measurement` mm
           on fr.fact_id_1 = mm.measurement_id
           and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)

    # mapping tables: schema and row counts
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id', 'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id', 'src_dataset_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # each mapping table should have one row per output row
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields = resources.fields_for(table_name)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self._table_has_clustering(table_info)

    # output dataset should contain exactly the expected tables
    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    # order-insensitive comparison of the person_id row dicts
    self.assertCountEqual(expected_rows, actual_rows)
def query_rows(query):
    """Execute *query* via bq_utils and return the result as a list of row dicts."""
    return bq_utils.response2rows(bq_utils.query(query))