def test_create_table(self):
    """Creating a table returns a bigquery#table resource and the new
    table is clustered (checked via ``_table_has_clustering``)."""
    table_id = 'some_random_table_id'
    fields = [
        dict(name='person_id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result = bq_utils.create_table(table_id, fields)
    # assertIn reports the actual keys on failure, unlike assertTrue('kind' in ...)
    self.assertIn('kind', result)
    self.assertEqual(result['kind'], 'bigquery#table')
    table_info = bq_utils.get_table_info(table_id)
    self._table_has_clustering(table_info)
def test_load_cdm_csv(self):
    """Upload the five-persons person CSV, load it into the CDM person
    table, and verify the load job finishes with five rows."""
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as person_file:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', person_file)
    load_result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(load_result['status']['state'], 'RUNNING')
    job_id = load_result['jobReference']['jobId']
    dest_table_id = load_result['configuration']['load']['destinationTable']['tableId']
    # wait_on_jobs returns the jobs still running after the timeout
    still_running = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(still_running), 0,
                     'loading table {} timed out'.format(dest_table_id))
    # numRows comes back from the API as a string
    row_count = bq_utils.get_table_info(dest_table_id).get('numRows')
    self.assertEqual(row_count, '5')
def get_table_columns(self, tablename):
    """
    Return a list of column names for the given table.

    :param tablename: name of the table to inspect (looked up in
        ``self.idataset``)
    :return: list of field names; empty when the table has no schema or
        no fields
    """
    info = bq_utils.get_table_info(tablename, dataset_id=self.idataset)
    schema = info.get('schema', {})
    # default to [] — the original schema.get('fields') returned None for a
    # missing key, which made the iteration below raise TypeError
    fields = schema.get('fields', [])
    return [field.get('name') for field in fields]
def _mapping_table_checks(self):
    """
    Check mapping tables exist, have correct schema, have expected number of records
    """
    # Restrict EHR counts to consented persons; alias `t` is presumably
    # supplied by the count query built in test_util.get_table_counts
    where = (
        'WHERE EXISTS '
        ' (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
        ' WHERE t.person_id = c.person_id)').format(
            combined_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()
    expected_counts = dict()
    # tables whose combined count legitimately differs from EHR + RDR
    expected_diffs = ['observation']
    for table in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(table)
        self.assertIn(expected_mapping_table, output_tables)
        # schema check: normalize the live schema payload before comparing
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(expected_mapping_table,
                                                    self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema', dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        self.assertCountEqual(expected_fields, actual_fields_norm)
        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created for demographics)
        if 'person_id' in [
                field.get('name', '') for field in resources.fields_for(table)
        ]:
            unconsented_ehr_records = self.get_unconsented_ehr_records_count(
                table)
        else:
            unconsented_ehr_records = 0
        actual_count = combined_counts[expected_mapping_table]
        if table in expected_diffs:
            expected_count = actual_count
        else:
            expected_count = (ehr_counts[table] -
                              unconsented_ehr_records) + rdr_counts[table]
        expected_counts[expected_mapping_table] = expected_count
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12; compare the expected subset key-by-key instead
    for mapping_table, expected_count in expected_counts.items():
        self.assertEqual(expected_count, combined_counts.get(mapping_table),
                         'unexpected count for {}'.format(mapping_table))
def test_validate_five_persons_success(self, mock_check_cron):
    """Upload the five-persons submission, run validation, and verify the
    result file, loaded tables, and clustering on person_id tables."""
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # stage every five_persons file under the submission prefix
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)

    expected_tables = [
        'person',
        'visit_occurrence',
        'condition_occurrence',
        'procedure_occurrence',
        'drug_exposure',
        'measurement',
    ]
    cdm_files = ['{}.csv'.format(name) for name in expected_tables]

    main.app.testing = True
    with main.app.test_client() as client:
        client.get(test_util.VALIDATE_HPO_FILES_URL)

    # check the result file was put in bucket
    expected_object_names = cdm_files + common.IGNORE_LIST + json_export_files
    expected_objects = {prefix + name for name in expected_object_names}
    actual_objects = {
        item['name'] for item in gcs_utils.list_bucket(self.hpo_bucket)
    }
    self.assertSetEqual(expected_objects, actual_objects)

    # result says file found, parsed, loaded
    actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                              prefix + common.RESULT_CSV)
    actual_result_items = resources._csv_file_to_list(
        StringIO.StringIO(actual_result))
    self.assertListEqual(sorted(expected_result_items),
                         sorted(actual_result_items))
    self.assertTrue(
        main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                       folder_prefix=prefix))

    # check tables exist and are clustered as expected
    for table in expected_tables:
        fields_file = os.path.join(resources.fields_path, table + '.json')
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self.table_has_clustering(table_info)
def _mapping_table_checks(self):
    """ Check mapping tables exist, have correct schema, have expected number of records """
    # Restrict EHR counts to consented persons; alias `t` is presumably
    # supplied by the count query built inside test_util.get_table_counts —
    # TODO confirm against that helper.
    where = '''
        WHERE EXISTS
        (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
        WHERE t.person_id = c.person_id)
        '''.format(ehr_rdr_dataset_id=self.combined_dataset_id,
                   ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()
    expected_counts = dict()
    # tables whose combined count legitimately differs from EHR + RDR
    expected_diffs = ['observation']
    # show full diffs on assertion failure
    self.maxDiff = None
    for t in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(t)
        self.assertIn(expected_mapping_table, output_tables)
        # schema check: normalize the live schema payload before comparing
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(
            expected_mapping_table, self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema', dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        # NOTE(review): assertItemsEqual and the expected=/actual= keyword
        # form of assertDictContainsSubset below exist only on Python 2 —
        # this variant presumably targets py2; confirm before porting.
        self.assertItemsEqual(expected_fields, actual_fields_norm)
        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created for demographics)
        actual_count = combined_counts[expected_mapping_table]
        expected_count = actual_count if t in expected_diffs else ehr_counts[
            t] + rdr_counts[t]
        expected_counts[expected_mapping_table] = expected_count
    self.assertDictContainsSubset(expected=expected_counts,
                                  actual=combined_counts)
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the input dataset is
    untouched, mapping tables have the expected schema and row counts,
    every CDM output table has the expected rows (and clustering when it
    has person_id), and output person_ids match the input HPO tables."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))
    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in ehr_union.tables_to_map()
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in common.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)
    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)
    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)
    # mapping tables
    tables_to_map = ehr_union.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must carry exactly these four columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)
    # check for each output table
    for table_name in common.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = test_util.response2rows(query_response)
        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)
    # explicit check that output person_ids are same as input
    chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        chs_person_table_id=chs_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = test_util.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = test_util.response2rows(response)
    self.assertListEqual(expected_rows, actual_rows)
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset: the input dataset to deid
    :param credentials: the credentials needed to create a new table.
        NOTE(review): this parameter is not referenced in the body —
        kept for interface compatibility; confirm whether callers need it.
    """
    map_tablename = "_mapping_person_src_hpos"
    mapping_prefix = '_mapping_'
    # one UNION ALL branch per table with a person_id column
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")
    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)
    # tables that have a corresponding _mapping_ table
    # (len(mapping_prefix) replaces the magic slice table[9:])
    mapped_tables = [
        table[len(mapping_prefix):]
        for table in dataset_tables
        if table.startswith(mapping_prefix)
    ]
    # make sure mapped tables all exist
    check_tables = [table for table in mapped_tables if table in dataset_tables]
    # keep only tables that contain a person_id field; break after the first
    # match so a table is never added (and unioned) twice, and default the
    # field name to '' so a schema entry without 'name' cannot raise
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name', ''):
                person_id_tables.append(table)
                break
    sql_statement = [
        sql.format(table=table, input_dataset=input_dataset)
        for table in person_id_tables
    ]
    final_query = ' UNION ALL '.join(sql_statement)
    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)
    if not sql_statement:
        # no person_id tables found: running an empty query would fail
        LOGGER.warning(
            f"No person_id tables found in {input_dataset}; "
            f"{map_tablename} left empty")
        return
    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)
    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the input dataset is
    untouched, fact_relationship ids are re-offset per HPO, mapping tables
    have the expected schema and row counts, every CDM output table has the
    expected rows (and clustering when it has person_id), and output
    person_ids match the input HPO person tables."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))
    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)
    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)
    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)
    # fact_relationship from pitt: ids in the unioned table should equal the
    # source ids shifted by the per-HPO offset
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    q = '''SELECT fact_id_1, fact_id_2
           FROM `{input_dataset}.{hpo_id}_fact_relationship`
           where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset
    q = '''SELECT fr.fact_id_1, fr.fact_id_2
           FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
           join `{dataset_id}._mapping_measurement` mm
           on fr.fact_id_1 = mm.measurement_id
           and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)
    # mapping tables
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must carry exactly these four columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)
    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)
        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)
    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    self.assertCountEqual(expected_rows, actual_rows)
def test_ehr_person_to_observation(self):
    # ehr person table converts to observation records
    """Verify move_ehr_person_to_observation turns each EHR person row into
    four observation rows (gender, race, DOB, ethnicity) with matching
    values and dates."""
    create_cdm_tables()
    copy_rdr_table('person')
    move_ehr_person_to_observation()
    # person table query
    # NOTE(review): the parenthesized SELECT list yields a single STRUCT
    # column, which is why the payload is read under the 'F0_' key below.
    q_person = '''
        SELECT (person_id,
                gender_concept_id, gender_source_value,
                race_concept_id, race_source_value,
                CAST(birth_datetime as STRING),
                ethnicity_concept_id, ethnicity_source_value,
                EXTRACT(DATE FROM birth_datetime))
        FROM {ehr_dataset_id}.person
        '''.format(ehr_dataset_id=self.ehr_dataset_id)
    # flatten each struct row into a plain list of values
    response_ehr_person = [[
        item['v'] for item in row['f']
    ] for row in query_result_to_payload(bq_utils.query(q_person))['F0_']]
    q_obs = '''
        SELECT (person_id,
                observation_concept_id,
                value_as_concept_id,
                value_as_string,
                observation_source_value,
                observation_date)
        FROM {ehr_dataset_id}.observation obs
        WHERE obs.observation_concept_id=4013886 -- Race - 4013886
        OR obs.observation_concept_id=4271761 -- Ethnic group - 4271761
        OR obs.observation_concept_id=4135376 -- Gender - 4135376
        OR obs.observation_concept_id=4083587 -- DOB - 4083587
        '''.format(ehr_dataset_id=self.combined_dataset_id)
    response_obs = [[
        item['v'] for item in row['f']
    ] for row in query_result_to_payload(bq_utils.query(q_obs))['F0_']]
    # concept ids (values come back from the payload as strings)
    gender_concept_id = '4135376'
    race_concept_id = '4013886'
    dob_concept_id = '4083587'
    ethnicity_concept_id = '4271761'
    # expected lists: (person_id, concept_id, source value, birth date)
    # built from the person-row columns selected above
    expected_gender_list = [(row[0], gender_concept_id, row[1], row[8])
                            for row in response_ehr_person]
    expected_race_list = [(row[0], race_concept_id, row[3], row[8])
                          for row in response_ehr_person]
    expected_dob_list = [(row[0], dob_concept_id, row[5], row[8])
                         for row in response_ehr_person]
    expected_ethnicity_list = [(row[0], ethnicity_concept_id, row[6], row[8])
                               for row in response_ehr_person]
    # actual lists, filtered from the observation rows by concept id
    # (DOB uses value_as_string, row[3]; the others use value_as_concept_id, row[2])
    actual_gender_list = [(row[0], row[1], row[2], row[5])
                          for row in response_obs
                          if row[1] == gender_concept_id]
    actual_race_list = [(row[0], row[1], row[2], row[5])
                        for row in response_obs
                        if row[1] == race_concept_id]
    actual_dob_list = [(row[0], row[1], row[3], row[5])
                       for row in response_obs
                       if row[1] == dob_concept_id]
    actual_ethnicity_list = [(row[0], row[1], row[2], row[5])
                             for row in response_obs
                             if row[1] == ethnicity_concept_id]
    self.assertListEqual(sorted(expected_gender_list),
                         sorted(actual_gender_list), 'gender check fails')
    self.assertListEqual(sorted(expected_race_list),
                         sorted(actual_race_list), 'race check fails')
    self.assertListEqual(sorted(expected_dob_list), sorted(actual_dob_list),
                         'dob check fails')
    self.assertListEqual(sorted(expected_ethnicity_list),
                         sorted(actual_ethnicity_list),
                         'ethnicity check fails')
    # four observation rows (gender/race/dob/ethnicity) per person row
    person_ehr_row_count = int(
        bq_utils.get_table_info('person', self.ehr_dataset_id)['numRows'])
    obs_row_count = int(
        bq_utils.get_table_info('observation',
                                self.combined_dataset_id)['numRows'])
    self.assertEqual(person_ehr_row_count * 4, obs_row_count)
def test_merge_EHR(self, mock_check_cron):
    """Run ehr_merge.merge and verify: each merged table has the expected
    row count (and clustering when it has person_id), the dataset gains
    exactly the expected tables, and pitt condition_occurrence rows survive
    the merge with their visit ids remapped via visit_id_mapping_table."""
    self._load_datasets()
    dataset_id = bq_utils.get_dataset_id()
    old_dataset_items = bq_utils.list_dataset_contents(dataset_id)
    # merge is expected to add the visit id mapping table plus one result
    # table per CDM table
    expected_items = ['visit_id_mapping_table']
    expected_items.extend([
        ehr_merge.result_table_for(table_name)
        for table_name in common.CDM_TABLES
    ])
    ehr_merge.merge(dataset_id, self.project_id)
    # Check row counts for each output table
    dataset_items = bq_utils.list_dataset_contents(dataset_id)
    for table_name in common.CDM_TABLES:
        result_table = ehr_merge.result_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(result_table)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(actual_count, expected_count, msg)
        # Check for clustering if table has person_id
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)
    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))
    # spot-check one HPO/table pair: pitt condition_occurrence
    table_name = 'condition_occurrence'
    hpo_id = 'pitt'
    result_table = ehr_merge.result_table_for(table_name)
    pitt_table = bq_utils.get_table_id(hpo_id, table_name)
    cmd_union = 'SELECT * FROM ' + result_table
    cmd_pitt = 'SELECT * FROM ' + pitt_table
    cmd_visit_mapping = """
        SELECT global_visit_id, mapping_visit_id
        FROM visit_id_mapping_table
        WHERE hpo='{hpo_id}'""".format(hpo_id=hpo_id)
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)
    # query_result_to_payload presumably returns a dict of
    # column-name -> list-of-values (column-major) — the indexing below
    # relies on that layout; confirm against the helper.
    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        # pick the item at `index` from every inner list, i.e. one row
        # across all columns of a column-major payload
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(
            pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        # follow source visit id -> global visit id -> row in union result
        global_visit_id_index = visit_mapping_result[
            'MAPPING_VISIT_ID'].index(pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        # compare all columns except the remapped id columns
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))