def setUp(self):
    """Provision App Engine testbed stubs and reset the test datasets/buckets."""
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)
    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    self.mapped_fields = [
        '{}_id'.format(table) for table in cdm.tables_to_map()
    ]
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
        eu_constants.LOCATION_ID
    ]
def test_queries_to_retract_from_combined_or_deid_dataset(
        self, mock_list_existing_tables):
    """
    Retraction queries should target exactly the existing retractable tables,
    and the death-table query must use person_id as-is (no constant factor).
    """
    existing_table_ids = []
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        existing_table_ids.append(cdm_table)
        if cdm_table not in self.tables_to_retract_combined:
            ignored_tables.append(cdm_table)

    mapped_tables = cdm.tables_to_map()
    for mapped_table in mapped_tables:
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        if mapped_table not in self.tables_to_retract_combined:
            ignored_tables.append(mapping_table)

    mock_list_existing_tables.return_value = existing_table_ids
    mqs, qs = retract_data_bq.queries_to_retract_from_combined_or_deid_dataset(
        self.project_id, self.combined_dataset_id, self.person_ids)
    actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)

    # death query should use person_id as-is (no constant factor)
    constant_factor = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
    for q in qs:
        # BUG FIX: compare strings with ==, not `is` -- identity comparison
        # depends on interning and could silently skip this assertion
        if q[retract_data_bq.DEST_TABLE] == common.DEATH:
            self.assertNotIn(str(constant_factor), q[retract_data_bq.QUERY])
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    for table in cdm.tables_to_map():
        # Unioned datasets prefix their CDM tables with 'unioned_ehr_'
        table_name = ('unioned_ehr_{table}'.format(table=table)
                      if 'unioned' in dataset_id else table)
        if not bq_utils.table_exists(table_name, dataset_id):
            continue
        # Generate column expressions for select
        col_exprs = [field['name'] for field in resources.fields_for(table)]
        cols = ',\n '.join(col_exprs)
        queries.append(
            ID_DE_DUP_QUERY.format(columns=cols,
                                   project_id=project_id,
                                   dataset_id=dataset_id,
                                   domain_table=table,
                                   table_name=table_name))
    return queries
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    for table in cdm.tables_to_map():
        # Generate column expressions for select
        cols = ', '.join(field['name'] for field in resources.fields_for(table))
        queries.append({
            cdr_consts.QUERY: ID_DE_DUP_QUERY.format(columns=cols,
                                                     project_id=project_id,
                                                     dataset_id=dataset_id,
                                                     domain_table=table,
                                                     table_name=table),
            cdr_consts.DESTINATION_TABLE: table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: dataset_id,
        })
    return queries
def get_query_result(hpo_id,
                     query_string,
                     table_id,
                     query_wrapper,
                     is_subquery,
                     app_id=None,
                     dataset_id=None):
    """
    Run a validation query for an HPO site and return the resulting rows.

    :param hpo_id: the name of the hpo_id for which validation is being done
    :param query_string: variable name of the query string stored in the constants
    :param table_id: Name of the table running analysis on
    :param query_wrapper: wrapper over the unioned query if required
    :param is_subquery: binary flag(true/false) to indicate if parsing is needed or not.
    :param app_id: name of the big query application id
    :param dataset_id: name of the big query dataset id
    :return: list of row dictionaries (empty when no matching tables exist)
    """
    app_id = app_identity.get_application_id() if app_id is None else app_id
    dataset_id = bq_utils.get_dataset_id() if dataset_id is None else dataset_id

    query = None
    if is_subquery:
        # Build one subquery per existing domain table and union them
        pieces = []
        for domain_table in cdm.tables_to_map():
            hpo_table = '{hpo_id}_{table_name}'.format(hpo_id=hpo_id,
                                                       table_name=domain_table)
            if not bq_utils.table_exists(hpo_table):
                continue
            pieces.append(
                query_string.format(hpo_id=hpo_id,
                                    app_id=app_id,
                                    dataset_id=dataset_id,
                                    domain_table=domain_table))
        unioned_query = main_constants.UNION_ALL.join(pieces)
        if unioned_query and query_wrapper is not None:
            query = query_wrapper.format(union_of_subqueries=unioned_query)
        else:
            query = unioned_query
    else:
        table_name = '{hpo_name}_{results_table}'.format(
            hpo_name=hpo_id, results_table=table_id)
        if bq_utils.table_exists(table_name):
            query = query_string.format(application=app_id,
                                        dataset=dataset_id,
                                        table_id=table_name)

    result = None
    if query:
        # Found achilles_heel_results table(s), run the query
        response = bq_utils.query(query)
        result = bq_utils.response2rows(response)
    return [] if result is None else result
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids_ex=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids_ex: (optional) list that identifies HPOs not to process, by default process all
    :returns: list of tables generated successfully
    """
    client = get_client(project_id)

    logging.info('EHR union started')

    # Determine the HPOs to process, dropping any the caller excluded
    hpo_ids = [entry['hpo_id'] for entry in bq_utils.get_hpo_info()]
    if hpo_ids_ex:
        hpo_ids = [hid for hid in hpo_ids if hid not in hpo_ids_ex]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info(f'Creating {output_dataset_id}.{result_table}...')
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(f'Mapping {domain_table}...')
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id, client)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(f'Creating union of table {table_name}...')
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = common.PERSON
    logging.info(f'Mapping {domain_table}...')
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id, client)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation,
    # one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)
    logging.info('Completed Person to Observation')
def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
    """
    Build sandbox and cleaning query specs for every CDM table with an id column.

    Sandbox queries (returned first) back up the affected rows into the
    sandbox dataset; the cleaning queries then rewrite each table in place.
    """
    sandbox_queries = []
    queries = []
    # iterate through the list of CDM tables with an id column
    for table_name in cdm.tables_to_map():
        sandbox_queries.append({
            cdr_consts.QUERY:
                ID_DE_DUP_SANDBOX_QUERY_TEMPLATE.render(
                    project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_name=table_name),
            cdr_consts.DESTINATION_TABLE:
                self.sandbox_table_for(table_name),
            cdr_consts.DISPOSITION:
                WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET:
                self.sandbox_dataset_id
        })
        queries.append({
            cdr_consts.QUERY:
                ID_DE_DUP_QUERY_TEMPLATE.render(project_id=self.project_id,
                                                dataset_id=self.dataset_id,
                                                table_name=table_name),
            cdr_consts.DESTINATION_TABLE:
                table_name,
            cdr_consts.DISPOSITION:
                WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET:
                self.dataset_id
        })
    # all sandbox queries must run before any cleaning query
    return sandbox_queries + queries
def get_duplicate_counts_query(hpo_id):
    """
    Query to retrieve count of duplicate primary keys in domain tables for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    all_table_ids = bq_utils.list_all_table_ids()
    sub_queries = []
    for domain_table in cdm.tables_to_map():
        candidate_id = bq_utils.get_table_id(hpo_id, domain_table)
        # only include tables the site has actually submitted
        if candidate_id in all_table_ids:
            sub_queries.append(
                render_query(consts.DUPLICATE_IDS_SUBQUERY,
                             table_name=domain_table,
                             table_id=candidate_id))
    return consts.DUPLICATE_IDS_WRAPPER.format(
        union_of_subqueries=consts.UNION_ALL.join(sub_queries))
def __init__(self, project_id, dataset_id, sandbox_dataset_id):
    """
    Initialize the class with proper information.

    Set the issue numbers, description and affected datasets. As other
    tickets may affect this SQL, append them to the list of Jira Issues.
    DO NOT REMOVE ORIGINAL JIRA ISSUE NUMBERS!
    """
    super().__init__(
        issue_numbers=JIRA_ISSUE_NUMBERS,
        description=(
            'Remove the duplicate id columns from OMOP tables that have an ID column '
            'in a given dataset'),
        affected_datasets=[cdr_consts.UNIONED],
        affected_tables=cdm.tables_to_map(),
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id)
def setUp(self):
    """Initialize identifiers and storage client; start from a clean slate."""
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [PITT_HPO_ID, NYC_HPO_ID, EXCLUDED_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self.storage_client = StorageClient(self.project_id)
    # reuse teardown logic to guarantee a clean starting state
    self.tearDown()
    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    self.mapped_fields = [f'{table}_id' for table in cdm.tables_to_map()]
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.VISIT_DETAIL_ID,
        eu_constants.CARE_SITE_ID, eu_constants.LOCATION_ID
    ]
def test_queries_to_retract_from_ehr_dataset(self, mock_list_existing_tables):
    """
    Retraction queries for an EHR dataset should cover every existing
    retractable table except the HPO person table.
    """
    hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
    hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

    # hpo tables
    existing_table_ids = [hpo_person, hpo_death]
    for table in self.tables_to_retract_unioned:
        table_id = bq_utils.get_table_id(self.hpo_id, table)
        existing_table_ids.append(table_id)

    # unioned tables
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
        existing_table_ids.append(unioned_table_id)
        if cdm_table not in self.tables_to_retract_unioned:
            ignored_tables.append(unioned_table_id)

    mapped_tables = cdm.tables_to_map()
    # fact_relationship does not have pid, is handled separate from other mapped tables
    for mapped_table in mapped_tables:
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
        existing_table_ids.append(legacy_mapping_table)
        if mapped_table not in self.tables_to_retract_unioned:
            ignored_tables.append(mapping_table)
            ignored_tables.append(legacy_mapping_table)

    mock_list_existing_tables.return_value = existing_table_ids
    mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
        self.project_id, self.ehr_dataset_id, self.project_id,
        self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
    actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    # BUG FIX: set(hpo_person) built a set of the *characters* of the table
    # id string, so the person table was never subtracted from the expected
    # set; use a one-element set literal instead.
    expected_dest_tables = set(existing_table_ids) - {hpo_person} - set(
        ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)
def setUp(self):
    """Initialize identifiers and clear buckets plus input/output tables."""
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    # Done in tearDown(); redundant here.
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)
    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    self.mapped_fields = [f'{table}_id' for table in cdm.tables_to_map()]
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
        eu_constants.LOCATION_ID
    ]
def test_queries_to_retract_from_unioned_dataset(self,
                                                 mock_list_existing_tables):
    """Retraction queries should target exactly the retractable unioned tables."""
    existing_table_ids = []
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        existing_table_ids.append(cdm_table)
        if cdm_table not in self.tables_to_retract_unioned:
            ignored_tables.append(cdm_table)

    for mapped_table in cdm.tables_to_map():
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        if mapped_table not in self.tables_to_retract_unioned:
            ignored_tables.append(mapping_table)

    mock_list_existing_tables.return_value = existing_table_ids
    mqs, qs = retract_data_bq.queries_to_retract_from_unioned_dataset(
        self.project_id, self.unioned_dataset_id, self.person_ids)
    actual_dest_tables = {q[retract_data_bq.DEST_TABLE] for q in qs + mqs}
    expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)
def test_union_ehr(self):
    """End-to-end check of ehr_union.main.

    Loads the HPO datasets, runs the union, then verifies: the input
    dataset is untouched, fact_relationship ids are offset per HPO,
    mapping tables have the expected schema and row counts, every output
    CDM table has the expected row count and clustering, and the output
    person_ids match the union of the inputs.
    """
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # fact_relationship from pitt: ids in the unioned output should be the
    # source ids shifted by the HPO-specific offset
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    q = '''SELECT fact_id_1, fact_id_2 FROM `{input_dataset}.{hpo_id}_fact_relationship` where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset

    q = '''SELECT fr.fact_id_1, fr.fact_id_2 FROM `{dataset_id}.unioned_ehr_fact_relationship` fr join `{dataset_id}._mapping_measurement` mm on fr.fact_id_1 = mm.measurement_id and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)

    # mapping tables: schema and row counts
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # one mapping row per output row is expected
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)
        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM ( SELECT person_id FROM {dataset_id}.{nyc_person_table_id} UNION ALL SELECT person_id FROM {dataset_id}.{pitt_person_table_id} ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id FROM {dataset_id}.{table_id} ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    self.assertCountEqual(expected_rows, actual_rows)
import cdm

# Observation source value marking an EHR consent response record
SOURCE_VALUE_EHR_CONSENT = 'EHRConsentPII_ConsentPermission'
# ConsentPermission_Yes
CONCEPT_ID_CONSENT_PERMISSION_YES = 1586100
# Intermediate table name used while combining datasets
EHR_CONSENT_TABLE_ID = '_ehr_consent'
PERSON_TABLE = 'person'
PERSON_ID = 'person_id'
OBSERVATION_TABLE = 'observation'
# Foreign key fields used when joining via the LEFT_JOIN template below
FOREIGN_KEYS_FIELDS = [
    'visit_occurrence_id', 'location_id', 'care_site_id', 'provider_id'
]
# Tables copied as-is from their respective source datasets
RDR_TABLES_TO_COPY = ['person']
EHR_TABLES_TO_COPY = ['death']
# Mapped CDM tables, excluding those copied verbatim above
DOMAIN_TABLES = list(
    set(cdm.tables_to_map()) - set(RDR_TABLES_TO_COPY + EHR_TABLES_TO_COPY))
TABLES_TO_PROCESS = RDR_TABLES_TO_COPY + EHR_TABLES_TO_COPY + DOMAIN_TABLES
# SQL fragment joining a domain table t to a mapping table aliased {prefix};
# the inner row_number()/WHERE row_num = 1 keeps at most one mapping row per
# ({field}, src_hpo_id) pair so the join cannot fan out duplicates
LEFT_JOIN = (
    ' LEFT JOIN'
    ' ('
    ' SELECT *'
    ' FROM ('
    ' SELECT'
    ' *,'
    ' row_number() OVER (PARTITION BY {prefix}.{field}, {prefix}.src_hpo_id ) '
    ' AS row_num'
    ' FROM {dataset_id}.{table} {prefix}'
    ' )'
    ' WHERE row_num = 1'
    ' ) {prefix} ON t.{field} = {prefix}.src_{field}'
    ' AND m.src_dataset_id = {prefix}.src_dataset_id')