def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ if combine_consts.PERSON_ID in [ field['name'] for field in resources.fields_for(domain_table) ]: return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), combined_dataset_id=bq_utils.get_combined_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID) else: return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), combined_dataset_id=bq_utils.get_combined_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT)
def _all_rdr_records_included(self): """ All rdr records are included whether or not there is corresponding ehr record """ for domain_table in DOMAIN_TABLES: mapping_table = mapping_table_for(domain_table) query = ( 'SELECT rt.{domain_table}_id as id ' 'FROM `{rdr_dataset_id}.{domain_table}` AS rt ' 'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m ' 'ON rt.{domain_table}_id = m.src_{domain_table}_id ' 'WHERE ' ' m.{domain_table}_id IS NULL ' 'OR NOT EXISTS ' ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t ' ' WHERE t.{domain_table}_id = m.{domain_table}_id)').format( domain_table=domain_table, rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_combined_dataset_id(), mapping_table=mapping_table) response = bq_utils.query(query) rows = bq_utils.response2rows(response) self.assertEqual( 0, len(rows), "RDR records should map to records in mapping and combined tables" )
def setUp(self): self.project_id = bq_utils.app_identity.get_application_id() self.dataset_id = bq_utils.get_combined_dataset_id() self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id() if not self.project_id or not self.dataset_id: # TODO: Fix handling of globals, push these assertions down if they are required. raise ValueError( f"missing configuration for project ('{self.project_id}') " + f"and/or dataset ('{self.dataset_id}')") # TODO: Reconcile this with a consistent integration testing model. Ideally each test should # clean up after itself so that we don't need this defensive check. test_util.delete_all_tables(self.dataset_id) # drop concept table drop_concept_table(self.dataset_id) create_tables = ['person', 'observation'] table_fields = { 'person': 'post_deid_person', 'observation': 'observation', 'concept': 'concept' } for tbl in ['concept']: if not bq_utils.table_exists(tbl, dataset_id=self.dataset_id): create_tables.append(tbl) for tbl in create_tables: bq_utils.create_standard_table(table_fields[tbl], tbl, dataset_id=self.dataset_id, force_all_nullable=True)
def clean_combined_dataset(project_id=None, dataset_id=None): """ Run all clean rules defined for the ehr and rdr dataset. :param project_id: Name of the BigQuery project. :param dataset_id: Name of the dataset to clean """ if project_id is None: project_id = app_identity.get_application_id() LOGGER.info('Project is unspecified. Using default value of:\t%s', project_id) if dataset_id is None: dataset_id = bq_utils.get_combined_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset_id) sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id, dataset_id=dataset_id) query_list = _gather_combined_queries(project_id, dataset_id, sandbox_dataset_id) LOGGER.info("Cleaning combined_dataset") clean_engine.clean_dataset(project_id, query_list, stage.COMBINED)
def validate_pii(): project = bq_utils.app_identity.get_application_id() combined_dataset = bq_utils.get_combined_dataset_id() ehr_dataset = bq_utils.get_dataset_id() dest_dataset = bq_utils.get_validation_results_dataset_id() logging.info('Calling match_participants') _, errors = matching.match_participants(project, combined_dataset, ehr_dataset, dest_dataset) if errors > 0: logging.error("Errors encountered in validation process") return consts.VALIDATION_SUCCESS
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return combine_consts.FACT_RELATIONSHIP_QUERY.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_combined_dataset_id(), mapping_measurement=mapping_table_for('measurement'), ehr_dataset=bq_utils.get_dataset_id(), mapping_observation=mapping_table_for('observation'), measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID, observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
def person_query(table_name): """ Maps location and care_Site id in person table :return: query """ combined_dataset_id = bq_utils.get_combined_dataset_id() columns, join_expression = join_expression_generator( table_name, combined_dataset_id) return combine_consts.MAPPED_PERSON_QUERY.format( cols=columns, dataset=combined_dataset_id, table=table_name, join_expr=join_expression)
def create_cdm_tables(): """ Create all CDM tables Note: Recreates any existing tables """ combined_dataset_id = bq_utils.get_combined_dataset_id() for table in resources.CDM_TABLES: logger.info('Creating table {dataset}.{table}...'.format( table=table, dataset=combined_dataset_id)) bq_utils.create_standard_table(table, table, drop_existing=True, dataset_id=combined_dataset_id)
def query(q, dst_table_id, write_disposition='WRITE_APPEND'): """ Run query and block until job is done :param q: SQL statement :param dst_table_id: if set, output is saved in a table with the specified id :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema) """ dst_dataset_id = bq_utils.get_combined_dataset_id() query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition, destination_dataset_id=dst_dataset_id) query_job_id = query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
def test_execute_queries(self): project_id = bq_utils.app_identity.get_application_id() dataset_id = bq_utils.get_combined_dataset_id() sandbox_id = bq_utils.get_unioned_dataset_id() test_util.delete_all_tables(dataset_id) create_tables = ( ['person'] + common.CLINICAL_DATA_TABLES + ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES]) # TODO(calbach): Make the setup/teardown of these concept tables hermetic. for tbl in ['concept', 'concept_ancestor']: if not bq_utils.table_exists(tbl, dataset_id=dataset_id): create_tables.push(tbl) for tbl in create_tables: bq_utils.create_standard_table(tbl, tbl, dataset_id=dataset_id, force_all_nullable=True) for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS: resp = bq_utils.query( tmpl.render(project_id=project_id, dataset_id=dataset_id, rdr_basics_concept_id=123, rdr_consent_concept_id=345, ehr_obs_concept_id=567, rdr_basics_module_concept_id= drop_participants_without_ppi_or_ehr. BASICS_MODULE_CONCEPT_ID)) self.assertTrue(resp["jobComplete"]) clean_cdr_engine.clean_dataset( project_id, dataset_id, sandbox_id, [(drop_participants_without_ppi_or_ehr.get_queries, )]) def table_to_person_ids(t): rows = bq_utils.response2rows( bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format( project_id, dataset_id, t))) return set([r["person_id"] for r in rows]) # We expect participants 1, 5 to have been removed from all tables. self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person")) self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation")) self.assertEquals(set([3, 4]), table_to_person_ids("drug_exposure")) test_util.delete_all_tables(dataset_id)
def copy_ehr_table(table): """ Copy table from EHR (consenting participants only) to the combined dataset without regenerating ids Note: Overwrites if a table already exists """ fields = resources.fields_for(table) field_names = [field['name'] for field in fields] if 'person_id' not in field_names: raise RuntimeError( 'Cannot copy EHR table {table}. It is missing columns needed for consent filter' .format(table=table)) q = combine_consts.COPY_EHR_QUERY.format( ehr_dataset_id=bq_utils.get_dataset_id(), table=table, ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID, combined_dataset_id=bq_utils.get_combined_dataset_id()) logger.info('Query for {table} is `{q}`'.format(table=table, q=q)) query(q, table)
def load_query(domain_table): """ Returns query used to load a domain table :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ rdr_dataset_id = bq_utils.get_rdr_dataset_id() ehr_dataset_id = bq_utils.get_dataset_id() combined_dataset_id = bq_utils.get_combined_dataset_id() mapping_table = mapping_table_for(domain_table) cols, join_expression = join_expression_generator(domain_table, combined_dataset_id) return combine_consts.LOAD_QUERY.format( cols=cols, domain_table=domain_table, rdr_dataset_id=rdr_dataset_id, ehr_dataset_id=ehr_dataset_id, mapping_table=mapping_table, join_expr=join_expression, combined_dataset_id=combined_dataset_id)
def setUp(self): self.ehr_dataset_id = bq_utils.get_dataset_id() self.rdr_dataset_id = bq_utils.get_rdr_dataset_id() self.combined_dataset_id = bq_utils.get_combined_dataset_id() self.drc_bucket = gcs_utils.get_drc_bucket() test_util.delete_all_tables(self.combined_dataset_id)