def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ if combine_consts.PERSON_ID in [ field['name'] for field in resources.fields_for(domain_table) ]: return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID) else: return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT)
def _all_rdr_records_included(self): """ All rdr records are included whether or not there is corresponding ehr record """ for domain_table in DOMAIN_TABLES: mapping_table = mapping_table_for(domain_table) q = '''SELECT rt.{domain_table}_id as id FROM {rdr_dataset_id}.{domain_table} rt LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m ON rt.{domain_table}_id = m.src_{domain_table}_id WHERE m.{domain_table}_id IS NULL OR NOT EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format( domain_table=domain_table, rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_table=mapping_table) response = bq_utils.query(q) rows = test_util.response2rows(response) self.assertEqual( 0, len(rows), "RDR records should map to records in mapping and combined tables" )
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return ''' SELECT fr.domain_concept_id_1 AS domain_concept_id_1, CASE WHEN domain_concept_id_1 = 21 THEN m1.measurement_id WHEN domain_concept_id_1 = 27 THEN o1.observation_id END AS fact_id_1, fr.domain_concept_id_2, CASE WHEN domain_concept_id_2 = 21 THEN m2.measurement_id WHEN domain_concept_id_2 = 27 THEN o2.observation_id END AS fact_id_2, fr.relationship_concept_id AS relationship_concept_id FROM {rdr_dataset_id}.fact_relationship fr LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1 ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1=21 LEFT JOIN {combined_dataset_id}.{mapping_observation} o1 ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1=27 LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2 ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2=21 LEFT JOIN {combined_dataset_id}.{mapping_observation} o2 ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2=27 '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), mapping_observation=mapping_table_for('observation'))
def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ return '''SELECT DISTINCT '{rdr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id, 'rdr' as src_hpo_id, {domain_table}_id + {mapping_constant} AS {domain_table}_id FROM {rdr_dataset_id}.{domain_table} UNION ALL SELECT DISTINCT '{ehr_dataset_id}' AS src_dataset_id, t.{domain_table}_id AS src_{domain_table}_id, v.src_hpo_id AS src_hpo_id, t.{domain_table}_id AS {domain_table}_id FROM {ehr_dataset_id}.{domain_table} t JOIN {ehr_dataset_id}._mapping_{domain_table} v on t.{domain_table}_id = v.{domain_table}_id WHERE EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c WHERE t.person_id = c.person_id) '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ return ''' WITH all_records AS ( SELECT '{rdr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id FROM {rdr_dataset_id}.{domain_table} UNION ALL SELECT '{ehr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id FROM {ehr_dataset_id}.{domain_table} t WHERE EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c WHERE t.person_id = c.person_id) ) SELECT ROW_NUMBER() OVER (ORDER BY src_dataset_id, src_{domain_table}_id) AS {domain_table}_id, src_dataset_id, src_{domain_table}_id FROM all_records '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
def setUp(self): super(CombineEhrRdrTest, self).setUp() self.APP_ID = bq_utils.app_identity.get_application_id() self.ehr_dataset_id = bq_utils.get_dataset_id() self.rdr_dataset_id = bq_utils.get_rdr_dataset_id() self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id() self.drc_bucket = gcs_utils.get_drc_bucket() test_util.delete_all_tables(self.combined_dataset_id)
def create_cdm_tables(): """ Create all CDM tables Note: Recreates any existing tables """ ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id() for table in common.CDM_TABLES: logger.debug('Creating table {dataset}.{table}...'.format(table=table, dataset=ehr_rdr_dataset_id)) bq_utils.create_standard_table(table, table, drop_existing=True, dataset_id=ehr_rdr_dataset_id)
def clean_ehr_rdr_dataset(project=None, dataset=None): if dataset is None or dataset == '' or dataset.isspace(): dataset = bq_utils.get_ehr_rdr_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset) query_list = _gather_ehr_rdr_queries(project, dataset) LOGGER.info("Cleaning ehr_rdr_dataset") clean_engine.clean_dataset(project, dataset, query_list)
def validate_pii(): project = bq_utils.app_identity.get_application_id() combined_dataset = bq_utils.get_ehr_rdr_dataset_id() ehr_dataset = bq_utils.get_dataset_id() dest_dataset = bq_utils.get_validation_results_dataset_id() logging.info('Calling match_participants') _, errors = matching.match_participants(project, combined_dataset, ehr_dataset, dest_dataset) if errors > 0: logging.error("Errors encountered in validation process") return consts.VALIDATION_SUCCESS
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return combine_consts.FACT_RELATIONSHIP_QUERY.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), ehr_dataset=bq_utils.get_dataset_id(), mapping_observation=mapping_table_for('observation'), measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID, observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
def query(q, dst_table_id, write_disposition='WRITE_APPEND'): """ Run query and block until job is done :param q: SQL statement :param dst_table_id: if set, output is saved in a table with the specified id :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema) """ dst_dataset_id = bq_utils.get_ehr_rdr_dataset_id() query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition, destination_dataset_id=dst_dataset_id) query_job_id = query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
def person_query(table_name): """ Maps location and care_Site id in person table :return: query """ ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id() columns, join_expression = join_expression_generator( table_name, ehr_rdr_dataset_id) return combine_consts.MAPPED_PERSON_QUERY.format( cols=columns, dataset=ehr_rdr_dataset_id, table=table_name, join_expr=join_expression)
def get_dataset_and_project_names(): """ Get project and dataset names from environment variables. :return: A dictionary of dataset names and project name """ project_and_dataset_names = dict() project_and_dataset_names[clean_cdr_consts.EHR_DATASET] = bq_utils.get_dataset_id() project_and_dataset_names[clean_cdr_consts.UNIONED_EHR_DATASET] = bq_utils.get_unioned_dataset_id() project_and_dataset_names[clean_cdr_consts.RDR_DATASET] = bq_utils.get_rdr_dataset_id() project_and_dataset_names[clean_cdr_consts.EHR_RDR_DATASET] = bq_utils.get_ehr_rdr_dataset_id() project_and_dataset_names[clean_cdr_consts.EHR_RDR_DE_IDENTIFIED] = bq_utils.get_combined_deid_dataset_id() project_and_dataset_names[clean_cdr_consts.PROJECT] = app_identity.get_application_id() return project_and_dataset_names
def clean_ehr_rdr_dataset(project_id=None, dataset_id=None): """ Run all clean rules defined for the ehr and rdr dataset. :param project_id: Name of the BigQuery project. :param dataset_id: Name of the dataset to clean """ if dataset_id is None or dataset_id == '' or dataset_id.isspace(): dataset_id = bq_utils.get_ehr_rdr_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset_id) query_list = _gather_ehr_rdr_queries(project_id, dataset_id) LOGGER.info("Cleaning ehr_rdr_dataset") clean_engine.clean_dataset(project_id, dataset_id, query_list)
def copy_ehr_table(table): """ Copy table from EHR (consenting participants only) to the combined dataset without regenerating ids Note: Overwrites if a table already exists """ fields = resources.fields_for(table) field_names = [field['name'] for field in fields] if 'person_id' not in field_names: raise RuntimeError( 'Cannot copy EHR table {table}. It is missing columns needed for consent filter' .format(table=table)) q = combine_consts.COPY_EHR_QUERY.format( ehr_dataset_id=bq_utils.get_dataset_id(), table=table, ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID, ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id()) logger.debug('Query for {table} is `{q}`'.format(table=table, q=q)) query(q, table)
def load_query(domain_table): """ Returns query used to load a domain table :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ rdr_dataset_id = bq_utils.get_rdr_dataset_id() ehr_dataset_id = bq_utils.get_dataset_id() ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id() mapping_table = mapping_table_for(domain_table) cols, join_expression = join_expression_generator(domain_table, ehr_rdr_dataset_id) return combine_consts.LOAD_QUERY.format( cols=cols, domain_table=domain_table, rdr_dataset_id=rdr_dataset_id, ehr_dataset_id=ehr_dataset_id, mapping_table=mapping_table, join_expr=join_expression, ehr_rdr_dataset_id=ehr_rdr_dataset_id)
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return ''' SELECT fr.domain_concept_id_1 AS domain_concept_id_1, CASE WHEN domain_concept_id_1 = {measurement_domain_concept_id} THEN m1.measurement_id WHEN domain_concept_id_1 = {observation_domain_concept_id} THEN o1.observation_id END AS fact_id_1, fr.domain_concept_id_2, CASE WHEN domain_concept_id_2 = {measurement_domain_concept_id} THEN m2.measurement_id WHEN domain_concept_id_2 = {observation_domain_concept_id} THEN o2.observation_id END AS fact_id_2, fr.relationship_concept_id AS relationship_concept_id FROM {rdr_dataset_id}.fact_relationship fr LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1 ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1={measurement_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_observation} o1 ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1={observation_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2 ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2={measurement_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_observation} o2 ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2={observation_domain_concept_id} '''.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), mapping_observation=mapping_table_for('observation'), measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID, observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
def load_query(domain_table): """ Returns query used to load a domain table :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ rdr_dataset_id = bq_utils.get_rdr_dataset_id() ehr_dataset_id = bq_utils.get_dataset_id() ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id() mapping_table = mapping_table_for(domain_table) has_visit_occurrence_id = False id_col = '{domain_table}_id'.format(domain_table=domain_table) fields = resources.fields_for(domain_table) # Generate column expressions for select, ensuring that # 1) we get the record IDs from the mapping table and # 2) if there is a reference to `visit_occurrence` we get `visit_occurrence_id` from the mapping visit table col_exprs = [] for field in fields: field_name = field['name'] if field_name == id_col: # Use mapping for record ID column # m is an alias that should resolve to the associated mapping table col_expr = 'm.{field_name} '.format(field_name=field_name) elif field_name == VISIT_OCCURRENCE_ID: # Replace with mapped visit_occurrence_id # mv is an alias that should resolve to the mapping visit table # Note: This is only reached when domain_table != visit_occurrence col_expr = 'mv.' + VISIT_OCCURRENCE_ID has_visit_occurrence_id = True else: col_expr = field_name col_exprs.append(col_expr) cols = ',\n '.join(col_exprs) visit_join_expr = '' if has_visit_occurrence_id: # Include a join to mapping visit table # Note: Using left join in order to keep records that aren't mapped to visits mv = mapping_table_for(VISIT_OCCURRENCE) visit_join_expr = ''' LEFT JOIN {ehr_rdr_dataset_id}.{mapping_visit_occurrence} mv ON t.visit_occurrence_id = mv.src_visit_occurrence_id AND m.src_dataset_id = mv.src_dataset_id'''.format( ehr_rdr_dataset_id=ehr_rdr_dataset_id, mapping_visit_occurrence=mv) return ''' SELECT {cols} FROM {rdr_dataset_id}.{domain_table} t JOIN {ehr_rdr_dataset_id}.{mapping_table} m ON t.{domain_table}_id = m.src_{domain_table}_id {visit_join_expr} WHERE m.src_dataset_id = '{rdr_dataset_id}' UNION ALL SELECT {cols} FROM {ehr_dataset_id}.{domain_table} t JOIN {ehr_rdr_dataset_id}.{mapping_table} m ON t.{domain_table}_id = m.src_{domain_table}_id {visit_join_expr} WHERE m.src_dataset_id = '{ehr_dataset_id}' '''.format(cols=cols, domain_table=domain_table, rdr_dataset_id=rdr_dataset_id, ehr_dataset_id=ehr_dataset_id, mapping_table=mapping_table, visit_join_expr=visit_join_expr, ehr_rdr_dataset_id=ehr_rdr_dataset_id)