def test_merge_EHR(self, mock_check_cron):
    self._load_datasets()
    # enable exception propagation as described at https://goo.gl/LqDgnj
    old_dataset_items = bq_utils.list_dataset_contents(
        bq_utils.get_dataset_id())
    expected_items = ['visit_id_mapping_table']
    expected_items.extend(
        ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

    ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)

    # check the result tables were created in the dataset
    dataset_items = bq_utils.list_dataset_contents(bq_utils.get_dataset_id())
    for table_name in common.CDM_TABLES:
        cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
        result = bq_utils.query(cmd)
        self.assertEqual(
            int(result['rows'][0]['f'][0]['v']),
            2 * globals().get(table_name.upper() + '_COUNT', 0),
            msg='failed for table unioned_ehr_{}'.format(table_name))
    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))

    table_name = 'condition_occurrence'
    cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
    cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
    cmd_visit_mapping = ("SELECT global_visit_id, mapping_visit_id "
                         "FROM visit_id_mapping_table WHERE hpo='pitt'")
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)
    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        global_visit_id_index = visit_mapping_result['MAPPING_VISIT_ID'].index(
            pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))
def merge(dataset_id, project_id):
    """Merge HPO EHR data.

    :param dataset_id: source and target dataset
    :param project_id: project in which everything happens
    :returns: list of tables generated successfully
    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)

    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)

    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading, [drop and] create all tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable'][
                'tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' %
                              (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
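# For reference, a minimal sketch of the result_table_for helper called by
# merge() above. The 'unioned_ehr_' prefix is an assumption inferred from the
# expected_items lists in the tests in this section; the real helper may
# derive the name differently.
def result_table_for(table_name):
    """Return the merged output table name for a CDM table (inferred sketch)."""
    return 'unioned_ehr_' + table_name  # e.g. 'person' -> 'unioned_ehr_person'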
def assert_tables_in(dataset_id):
    """
    Raise a RuntimeError if any CDM tables are missing from a dataset

    :param dataset_id: dataset to check for tables in
    """
    tables = bq_utils.list_dataset_contents(dataset_id)
    logger.debug('Dataset {dataset_id} has tables: {tables}'.format(
        dataset_id=dataset_id, tables=tables))
    for table in TABLES_TO_PROCESS:
        if table not in tables:
            raise RuntimeError(
                'Dataset {dataset} is missing table {table}. Aborting.'.format(
                    dataset=dataset_id, table=table))
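# A hedged usage sketch for assert_tables_in: call it before processing so a
# missing CDM table fails fast with a clear message. The dataset id below is
# hypothetical.
#
#   assert_tables_in('combined_20200101')  # raises RuntimeError if a table is absent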
def tearDown(self):
    delete_list = ['visit_id_mapping_table'] + [
        'unioned_ehr_' + table_name for table_name in common.CDM_TABLES
    ]
    existing_tables = bq_utils.list_dataset_contents(bq_utils.get_dataset_id())
    for table_id in delete_list:
        if table_id not in common.VOCABULARY_TABLES and table_id in existing_tables:
            bq_utils.delete_table(table_id)
    self._empty_bucket(self.hpo_bucket)
    self.testbed.deactivate()
def get_output_tables(input_dataset, known_tables, skip_tables, only_tables):
    """
    Get list of output tables deid should produce.

    Specifically excludes table names that start with underscores, pii, or are
    explicitly suppressed.

    :param input_dataset: dataset to read when gathering all possible table names.
    :param known_tables: list of tables known to curation.  If a table exists
        in the input dataset but is not known to curation, it is skipped.
    :param skip_tables: command line csv string of tables to skip for deid.
        Useful to perform deid on a subset of tables.
    :param only_tables: command line csv string of tables to deid exclusively.
        An empty string applies no restriction.
    :return: a list of table names to execute deid over.
    """
    tables = bq_utils.list_dataset_contents(input_dataset)
    skip_tables = [table.strip() for table in skip_tables.split(',')]
    only_tables = [table.strip() for table in only_tables.split(',')]

    allowed_tables = []
    for table in tables:
        if table.startswith('_'):
            continue
        if table.startswith('pii'):
            continue
        if table in SUPPRESSED_TABLES:
            continue
        # doing this to eliminate the 'deid_map' table and any other non-OMOP table
        if table not in known_tables:
            continue
        if table in skip_tables:
            continue

        if (only_tables == [''] or
                table in only_tables) and table in DEID_TABLES:
            allowed_tables.append(table)

    return allowed_tables
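# Illustration of the csv-string semantics above (dataset and table names are
# hypothetical). An empty only_tables string splits to [''], which disables
# the only-list filter so every eligible table passes; a non-empty string
# restricts output to the named tables.
#
#   get_output_tables('my_dataset', known_tables, skip_tables='note',
#                     only_tables='')                      # all eligible tables
#   get_output_tables('my_dataset', known_tables, skip_tables='',
#                     only_tables='person, observation')   # at most these two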
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by the rdr team
    :param ehr_dataset: the dataset containing the pii information for comparisons
    :param dest_dataset_id: the desired identifier for the match values destination dataset
    :return: results of the field comparison for each hpo
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        results = {}

        # validate first names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # validate middle names
        try:
            match_values = None
            # match_values = _compare_name_fields(
            #     project,
            #     validation_dataset,
            #     ehr_dataset,
            #     site,
            #     consts.OBS_PII_NAME_MIDDLE,
            #     consts.MIDDLE_NAME_FIELD,
            #     ehr_tables
            # )
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.MIDDLE_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            # results = _add_matches_to_results(results, match_values,
            #                                   consts.MIDDLE_NAME_FIELD)
            LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = None
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = None
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = None
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses
        try:
            address_one_matches = None
            address_two_matches = None
            match_values = None
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, "
                f"{consts.ADDRESS_TWO_FIELD} at site: {site}")
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = None
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = None
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = None
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = None
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info("Writing results to BQ table")
        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset: {site}"
            )
            write_errors += 1
        else:
            LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created: {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
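# A hedged invocation sketch for match_participants; the identifiers below are
# made up. The return value is the combined read/write error count, so a
# non-zero result means at least one site could not be fully validated or
# written.
#
#   errors = match_participants('aou-curation-project', 'rdr_20200101',
#                               'ehr_20200101', 'temp_validation')
#   if errors:
#       LOGGER.error('%d errors during participant matching', errors)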
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset: the input dataset to deid
    :param credentials: the credentials needed to create a new table.
    """
    map_tablename = "_mapping_person_src_hpos"
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")

    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)

    mapping_tables = []
    mapped_tables = []
    for table in dataset_tables:
        if table.startswith('_mapping_'):
            mapping_tables.append(table)
            mapped_tables.append(table[9:])

    # make sure mapped tables all exist
    check_tables = []
    for table in mapped_tables:
        if table in dataset_tables:
            check_tables.append(table)

    # make sure check_tables contain person_id fields
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name'):
                person_id_tables.append(table)

    # revamp mapping tables to contain only mapping tables for tables
    # with person_id fields
    mapping_tables = ['_mapping_' + table for table in person_id_tables]

    sql_statement = []
    for table in person_id_tables:
        sql_statement.append(sql.format(table=table,
                                        input_dataset=input_dataset))

    final_query = ' UNION ALL '.join(sql_statement)

    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)

    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)

    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
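# Shape of the final_query built above, assuming the person_id tables found
# were ['observation', 'measurement'] (dataset name 'ds' is illustrative):
#
#   select person_id, src_hpo_id from ds._mapping_observation
#     join ds.observation using (observation_id) where src_hpo_id not like 'rdr'
#   UNION ALL
#   select person_id, src_hpo_id from ds._mapping_measurement
#     join ds.measurement using (measurement_id) where src_hpo_id not like 'rdr'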
def test_merge_EHR(self, mock_check_cron):
    self._load_datasets()
    dataset_id = bq_utils.get_dataset_id()
    old_dataset_items = bq_utils.list_dataset_contents(dataset_id)
    expected_items = ['visit_id_mapping_table']
    expected_items.extend([
        ehr_merge.result_table_for(table_name)
        for table_name in common.CDM_TABLES
    ])

    ehr_merge.merge(dataset_id, self.project_id)

    # Check row counts for each output table
    dataset_items = bq_utils.list_dataset_contents(dataset_id)
    for table_name in common.CDM_TABLES:
        result_table = ehr_merge.result_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(result_table)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(actual_count, expected_count, msg)

        # Check for clustering if table has person_id
        fields_file = os.path.join(resources.fields_path,
                                   table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self._table_has_clustering(table_info)

    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))

    table_name = 'condition_occurrence'
    hpo_id = 'pitt'
    result_table = ehr_merge.result_table_for(table_name)
    pitt_table = bq_utils.get_table_id(hpo_id, table_name)
    cmd_union = 'SELECT * FROM ' + result_table
    cmd_pitt = 'SELECT * FROM ' + pitt_table
    cmd_visit_mapping = """
    SELECT global_visit_id, mapping_visit_id
    FROM visit_id_mapping_table
    WHERE hpo='{hpo_id}'""".format(hpo_id=hpo_id)
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)
    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        global_visit_id_index = visit_mapping_result['MAPPING_VISIT_ID'].index(
            pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))