def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ if combine_consts.PERSON_ID in [ field['name'] for field in resources.fields_for(domain_table) ]: return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID) else: return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT)
def get_hpo_missing_pii_query(hpo_id): """ Query to retrieve counts of drug classes in an HPO site's drug_exposure table :param hpo_id: identifies the HPO site :return: the query """ person_table_id = bq_utils.get_table_id(hpo_id, common.PERSON) pii_name_table_id = bq_utils.get_table_id(hpo_id, common.PII_NAME) pii_wildcard = bq_utils.get_table_id(hpo_id, common.PII_WILDCARD) participant_match_table_id = bq_utils.get_table_id( hpo_id, common.PARTICIPANT_MATCH) rdr_dataset_id = bq_utils.get_rdr_dataset_id() rdr_date = extract_date_from_rdr_dataset_id(rdr_dataset_id) ehr_no_rdr_with_date = consts.EHR_NO_RDR.format(date=rdr_date) rdr_person_table_id = common.PERSON return render_query( consts.MISSING_PII_QUERY, person_table_id=person_table_id, rdr_dataset_id=rdr_dataset_id, rdr_person_table_id=rdr_person_table_id, ehr_no_pii=consts.EHR_NO_PII, ehr_no_rdr=ehr_no_rdr_with_date, pii_no_ehr=consts.PII_NO_EHR, ehr_no_participant_match=consts.EHR_NO_PARTICIPANT_MATCH, pii_name_table_id=pii_name_table_id, pii_wildcard=pii_wildcard, participant_match_table_id=participant_match_table_id)
def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ return '''SELECT DISTINCT '{rdr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id, 'rdr' as src_hpo_id, {domain_table}_id + {mapping_constant} AS {domain_table}_id FROM {rdr_dataset_id}.{domain_table} UNION ALL SELECT DISTINCT '{ehr_dataset_id}' AS src_dataset_id, t.{domain_table}_id AS src_{domain_table}_id, v.src_hpo_id AS src_hpo_id, t.{domain_table}_id AS {domain_table}_id FROM {ehr_dataset_id}.{domain_table} t JOIN {ehr_dataset_id}._mapping_{domain_table} v on t.{domain_table}_id = v.{domain_table}_id WHERE EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c WHERE t.person_id = c.person_id) '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return ''' SELECT fr.domain_concept_id_1 AS domain_concept_id_1, CASE WHEN domain_concept_id_1 = 21 THEN m1.measurement_id WHEN domain_concept_id_1 = 27 THEN o1.observation_id END AS fact_id_1, fr.domain_concept_id_2, CASE WHEN domain_concept_id_2 = 21 THEN m2.measurement_id WHEN domain_concept_id_2 = 27 THEN o2.observation_id END AS fact_id_2, fr.relationship_concept_id AS relationship_concept_id FROM {rdr_dataset_id}.fact_relationship fr LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1 ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1=21 LEFT JOIN {combined_dataset_id}.{mapping_observation} o1 ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1=27 LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2 ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2=21 LEFT JOIN {combined_dataset_id}.{mapping_observation} o2 ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2=27 '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), mapping_observation=mapping_table_for('observation'))
def _all_rdr_records_included(self): """ All rdr records are included whether or not there is corresponding ehr record """ for domain_table in DOMAIN_TABLES: mapping_table = mapping_table_for(domain_table) q = '''SELECT rt.{domain_table}_id as id FROM {rdr_dataset_id}.{domain_table} rt LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m ON rt.{domain_table}_id = m.src_{domain_table}_id WHERE m.{domain_table}_id IS NULL OR NOT EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format( domain_table=domain_table, rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_table=mapping_table) response = bq_utils.query(q) rows = test_util.response2rows(response) self.assertEqual( 0, len(rows), "RDR records should map to records in mapping and combined tables" )
def tearDownClass(cls): ehr_dataset_id = bq_utils.get_dataset_id() rdr_dataset_id = bq_utils.get_rdr_dataset_id() test_util.delete_all_tables(ehr_dataset_id) test_util.delete_all_tables(rdr_dataset_id) cls.testbed.deactivate() logger.handlers = []
def clean_rdr_dataset(project_id=None, dataset_id=None): """ Run all clean rules defined for the rdr dataset. :param project_id: Name of the BigQuery project. :param dataset_id: Name of the dataset to clean """ if project_id is None: project_id = app_identity.get_application_id() LOGGER.info('Project is unspecified. Using default value of:\t%s', project_id) if dataset_id is None: dataset_id = bq_utils.get_rdr_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset_id) sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id, dataset_id=dataset_id) query_list = _gather_rdr_queries(project_id, dataset_id, sandbox_dataset_id) LOGGER.info("Cleaning rdr_dataset") clean_engine.clean_dataset(project_id, query_list, stage.RDR)
def ehr_consent_query(): """ Returns query used to get the `person_id` of only those participants who have consented to share EHR data :return: """ # Consenting are strictly those whose *most recent* (based on observation_datetime) consent record is YES # If the most recent record is NO or NULL, they are NOT consenting return ''' WITH ordered_response AS (SELECT person_id, value_source_concept_id, observation_datetime, ROW_NUMBER() OVER(PARTITION BY person_id ORDER BY observation_datetime DESC, value_source_concept_id ASC) AS rn FROM {dataset_id}.observation WHERE observation_source_value = '{source_value_ehr_consent}') SELECT person_id FROM ordered_response WHERE rn = 1 AND value_source_concept_id = {concept_id_consent_permission_yes} '''.format( dataset_id=bq_utils.get_rdr_dataset_id(), source_value_ehr_consent=SOURCE_VALUE_EHR_CONSENT, concept_id_consent_permission_yes=CONCEPT_ID_CONSENT_PERMISSION_YES)
def _all_rdr_records_included(self): """ All rdr records are included whether or not there is corresponding ehr record """ for domain_table in DOMAIN_TABLES: mapping_table = mapping_table_for(domain_table) query = ( 'SELECT rt.{domain_table}_id as id ' 'FROM `{rdr_dataset_id}.{domain_table}` AS rt ' 'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m ' 'ON rt.{domain_table}_id = m.src_{domain_table}_id ' 'WHERE ' ' m.{domain_table}_id IS NULL ' 'OR NOT EXISTS ' ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t ' ' WHERE t.{domain_table}_id = m.{domain_table}_id)').format( domain_table=domain_table, rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_combined_dataset_id(), mapping_table=mapping_table) response = bq_utils.query(query) rows = bq_utils.response2rows(response) self.assertEqual( 0, len(rows), "RDR records should map to records in mapping and combined tables" )
def mapping_query(domain_table): """ Returns query used to get mapping of all records from RDR combined with EHR records of consented participants :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ return ''' WITH all_records AS ( SELECT '{rdr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id FROM {rdr_dataset_id}.{domain_table} UNION ALL SELECT '{ehr_dataset_id}' AS src_dataset_id, {domain_table}_id AS src_{domain_table}_id FROM {ehr_dataset_id}.{domain_table} t WHERE EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c WHERE t.person_id = c.person_id) ) SELECT ROW_NUMBER() OVER (ORDER BY src_dataset_id, src_{domain_table}_id) AS {domain_table}_id, src_dataset_id, src_{domain_table}_id FROM all_records '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), domain_table=domain_table, ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
def setUp(self): self.project_id = app_identity.get_application_id() self.dataset_id = bq_utils.get_rdr_dataset_id() self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id( self.dataset_id) sandbox.check_and_create_sandbox_dataset(self.project_id, self.dataset_id)
def setUp(self): super(CombineEhrRdrTest, self).setUp() self.APP_ID = bq_utils.app_identity.get_application_id() self.ehr_dataset_id = bq_utils.get_dataset_id() self.rdr_dataset_id = bq_utils.get_rdr_dataset_id() self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id() self.drc_bucket = gcs_utils.get_drc_bucket() test_util.delete_all_tables(self.combined_dataset_id)
def assert_ehr_and_rdr_tables(): """ Raise assertion error if any CDM tables missing from EHR or RDR dataset """ ehr_dataset_id = bq_utils.get_dataset_id() assert_tables_in(ehr_dataset_id) rdr_dataset_id = bq_utils.get_rdr_dataset_id() assert_tables_in(rdr_dataset_id)
def copy_rdr_table(table): """ Copy table from the RDR dataset to the combined dataset Note: Overwrites if a table already exists """ q = '''SELECT * FROM {rdr_dataset_id}.{table}'''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), table=table) logger.debug('Query for {table} is `{q}`'.format(table=table, q=q)) query(q, table)
def get_unconsented_ehr_records_count(self, table_name): q = UNCONSENTED_EHR_COUNTS_QUERY.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=self.combined_dataset_id, domain_table=table_name, ehr_consent_table_id='_ehr_consent') response = bq_utils.query(q) rows = bq_utils.response2rows(response) return rows[0]['n']
def copy_rdr_table(table): """ Copy table from the RDR dataset to the combined dataset Note: Overwrites if a table already exists """ q = combine_consts.COPY_RDR_QUERY.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), table=table) logger.debug('Query for {table} is `{q}`'.format(table=table, q=q)) query(q, table)
def clean_rdr_dataset(project=None, dataset=None): if dataset is None or dataset == '' or dataset.isspace(): dataset = bq_utils.get_rdr_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset) query_list = _gather_rdr_queries(project, dataset) LOGGER.info("Cleaning rdr_dataset") clean_engine.clean_dataset(project, dataset, query_list)
def test_mapping_query(self): table_name = 'visit_occurrence' q = mapping_query(table_name) expected_query = EXPECTED_MAPPING_QUERY.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(), ehr_dataset_id=bq_utils.get_dataset_id(), ehr_rdr_dataset_id=self.combined_dataset_id, domain_table=table_name, mapping_constant=common.RDR_ID_CONSTANT, ehr_consent_table_id=EHR_CONSENT_TABLE_ID) self.assertEqual(expected_query, q, "Mapping query for \n {q} \n to is not as expected".format(q=q))
def setUpClass(cls): print('**************************************************************') print(cls.__name__) print('**************************************************************') # TODO base class this ehr_dataset_id = bq_utils.get_dataset_id() rdr_dataset_id = bq_utils.get_rdr_dataset_id() test_util.delete_all_tables(ehr_dataset_id) test_util.delete_all_tables(rdr_dataset_id) cls.load_dataset_from_files(ehr_dataset_id, test_util.NYC_FIVE_PERSONS_PATH, True) cls.load_dataset_from_files(rdr_dataset_id, test_util.RDR_PATH)
def ehr_consent_query(): """ Returns query used to get the `person_id` of only those participants who have consented to share EHR data :return: """ # Consenting are strictly those whose *most recent* (based on observation_datetime) consent record is YES # If the most recent record is NO or NULL, they are NOT consenting return combine_consts.EHR_CONSENT_QUERY.format( dataset_id=bq_utils.get_rdr_dataset_id(), source_value_ehr_consent=combine_consts.SOURCE_VALUE_EHR_CONSENT, concept_id_consent_permission_yes=combine_consts. CONCEPT_ID_CONSENT_PERMISSION_YES)
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return combine_consts.FACT_RELATIONSHIP_QUERY.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), ehr_dataset=bq_utils.get_dataset_id(), mapping_observation=mapping_table_for('observation'), measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID, observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
def setUpClass(cls): print('**************************************************************') print(cls.__name__) print('**************************************************************') # TODO base class this logger.level = logging.INFO stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) ehr_dataset_id = bq_utils.get_dataset_id() rdr_dataset_id = bq_utils.get_rdr_dataset_id() test_util.delete_all_tables(ehr_dataset_id) test_util.delete_all_tables(rdr_dataset_id) cls.load_dataset_from_files(ehr_dataset_id, test_util.NYC_FIVE_PERSONS_PATH, True) cls.load_dataset_from_files(rdr_dataset_id, test_util.RDR_PATH)
def get_dataset_and_project_names(): """ Get project and dataset names from environment variables. :return: A dictionary of dataset names and project name """ project_and_dataset_names = dict() project_and_dataset_names[clean_cdr_consts.EHR_DATASET] = bq_utils.get_dataset_id() project_and_dataset_names[clean_cdr_consts.UNIONED_EHR_DATASET] = bq_utils.get_unioned_dataset_id() project_and_dataset_names[clean_cdr_consts.RDR_DATASET] = bq_utils.get_rdr_dataset_id() project_and_dataset_names[clean_cdr_consts.EHR_RDR_DATASET] = bq_utils.get_ehr_rdr_dataset_id() project_and_dataset_names[clean_cdr_consts.EHR_RDR_DE_IDENTIFIED] = bq_utils.get_combined_deid_dataset_id() project_and_dataset_names[clean_cdr_consts.PROJECT] = app_identity.get_application_id() return project_and_dataset_names
def clean_rdr_dataset(project_id=None, dataset_id=None): """ Run all clean rules defined for the rdr dataset. :param project_id: Name of the BigQuery project. :param dataset_id: Name of the dataset to clean """ if dataset_id is None or dataset_id == '' or dataset_id.isspace(): dataset_id = bq_utils.get_rdr_dataset_id() LOGGER.info('Dataset is unspecified. Using default value of:\t%s', dataset_id) query_list = _gather_rdr_queries(project_id, dataset_id) LOGGER.info("Cleaning rdr_dataset") clean_engine.clean_dataset(project_id, dataset_id, query_list)
def setUp(self): self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID) self.project_id = app_identity.get_application_id() self.dataset_id = bq_utils.get_dataset_id() self.rdr_dataset_id = bq_utils.get_rdr_dataset_id() self.folder_prefix = '2019-01-01/' test_util.delete_all_tables(self.dataset_id) test_util.empty_bucket(self.hpo_bucket) mock_get_hpo_name = mock.patch('validation.main.get_hpo_name') self.mock_get_hpo_name = mock_get_hpo_name.start() self.mock_get_hpo_name.return_value = 'Fake HPO' self.addCleanup(mock_get_hpo_name.stop) self._load_data()
def setUp(self): self.hpo_id = test_util.FAKE_HPO_ID self.hpo_bucket = gcs_utils.get_hpo_bucket(self.hpo_id) self.project_id = app_identity.get_application_id() self.rdr_dataset_id = bq_utils.get_rdr_dataset_id() mock_get_hpo_name = mock.patch('validation.main.get_hpo_name') self.mock_get_hpo_name = mock_get_hpo_name.start() self.mock_get_hpo_name.return_value = 'Fake HPO' self.addCleanup(mock_get_hpo_name.stop) self.bigquery_dataset_id = bq_utils.get_dataset_id() self.folder_prefix = '2019-01-01-v1/' self._empty_bucket() test_util.delete_all_tables(self.bigquery_dataset_id) self._create_drug_class_table(self.bigquery_dataset_id)
def setUpClass(cls): # TODO base class this logger.level = logging.INFO stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) cls.testbed = testbed.Testbed() cls.testbed.activate() cls.testbed.init_app_identity_stub() cls.testbed.init_memcache_stub() cls.testbed.init_urlfetch_stub() cls.testbed.init_blobstore_stub() cls.testbed.init_datastore_v3_stub() ehr_dataset_id = bq_utils.get_dataset_id() rdr_dataset_id = bq_utils.get_rdr_dataset_id() test_util.delete_all_tables(ehr_dataset_id) test_util.delete_all_tables(rdr_dataset_id) cls.load_dataset_from_files(ehr_dataset_id, test_util.NYC_FIVE_PERSONS_PATH) cls.load_dataset_from_files(rdr_dataset_id, test_util.RDR_PATH)
def setUpClass(cls): print('**************************************************************') print(cls.__name__) print('**************************************************************') super().initialize_class_vars() project_id = app_identity.get_application_id() dataset_id = bq_utils.get_rdr_dataset_id() sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id) rule = PpiBranching(project_id, dataset_id, sandbox_dataset_id) cls.dataset_id = dataset_id cls.sandbox_dataset_id = sandbox_dataset_id cls.project_id = project_id cls.rule_instance = rule cls.fq_sandbox_table_names = [ _fq_table_name(table) for table in (rule.lookup_table, rule.backup_table) ] cls.fq_table_names = [_fq_table_name(rule.observation_table)] super().setUpClass()
def load_query(domain_table): """ Returns query used to load a domain table :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence') :return: """ rdr_dataset_id = bq_utils.get_rdr_dataset_id() ehr_dataset_id = bq_utils.get_dataset_id() ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id() mapping_table = mapping_table_for(domain_table) cols, join_expression = join_expression_generator(domain_table, ehr_rdr_dataset_id) return combine_consts.LOAD_QUERY.format( cols=cols, domain_table=domain_table, rdr_dataset_id=rdr_dataset_id, ehr_dataset_id=ehr_dataset_id, mapping_table=mapping_table, join_expr=join_expression, ehr_rdr_dataset_id=ehr_rdr_dataset_id)
def fact_relationship_query(): """ Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2 :return: """ return ''' SELECT fr.domain_concept_id_1 AS domain_concept_id_1, CASE WHEN domain_concept_id_1 = {measurement_domain_concept_id} THEN m1.measurement_id WHEN domain_concept_id_1 = {observation_domain_concept_id} THEN o1.observation_id END AS fact_id_1, fr.domain_concept_id_2, CASE WHEN domain_concept_id_2 = {measurement_domain_concept_id} THEN m2.measurement_id WHEN domain_concept_id_2 = {observation_domain_concept_id} THEN o2.observation_id END AS fact_id_2, fr.relationship_concept_id AS relationship_concept_id FROM {rdr_dataset_id}.fact_relationship fr LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1 ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1={measurement_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_observation} o1 ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1={observation_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2 ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2={measurement_domain_concept_id} LEFT JOIN {combined_dataset_id}.{mapping_observation} o2 ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2={observation_domain_concept_id} '''.format( rdr_dataset_id=bq_utils.get_rdr_dataset_id(), combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(), mapping_measurement=mapping_table_for('measurement'), mapping_observation=mapping_table_for('observation'), measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID, observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)