def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs

    :param hpo_id: identifies the HPO site
    :return: the formatted query string
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)
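# A minimal usage sketch (not from the original module): running the formatted
# query with the bq_utils helpers used throughout this codebase. The hpo_id
# value 'fake_hpo' is a hypothetical example.
def _example_check_required_labs():
    query = get_lab_concept_summary_query('fake_hpo')
    response = bq_utils.query(query)
    return bq_utils.response2rows(response)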
def _load_data(self):
    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.dataset_id)

    # Create empty concept and concept_ancestor tables if they do not exist
    if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)

    # We need to load measurement.csv into bigquery_dataset_id in advance
    # for the other integration tests
    ehr_measurement_result = bq_utils.load_table_from_csv(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
        csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
        fields=resources.fields_for(common.MEASUREMENT))
    bq_utils.wait_on_jobs([ehr_measurement_result['jobReference']['jobId']])
def test_copy_rdr_tables(self):
    for table in RDR_TABLES_TO_COPY:
        self.assertFalse(
            bq_utils.table_exists(table,
                                  self.combined_dataset_id))  # sanity check
        copy_rdr_table(table)
        actual = bq_utils.table_exists(table, self.combined_dataset_id)
        self.assertTrue(
            actual,
            msg='RDR table {table} should be copied'.format(table=table))

        # Check that row count in combined is same as rdr
        query = ('WITH rdr AS '
                 ' (SELECT COUNT(1) n FROM `{rdr_dataset_id}.{table}`), '
                 'combined AS '
                 ' (SELECT COUNT(1) n FROM `{combined_dataset_id}.{table}`) '
                 'SELECT '
                 'rdr.n AS rdr_count, '
                 'combined.n AS combined_count '
                 'FROM rdr, combined ').format(
                     rdr_dataset_id=self.rdr_dataset_id,
                     combined_dataset_id=self.combined_dataset_id,
                     table=table)
        response = bq_utils.query(query)
        rows = bq_utils.response2rows(response)
        self.assertTrue(len(rows) == 1)  # sanity check
        row = rows[0]
        rdr_count, combined_count = row['rdr_count'], row['combined_count']
        msg_fmt = ('Table {table} has {rdr_count} in rdr and '
                   '{combined_count} in combined (expected to be equal)')
        self.assertEqual(
            rdr_count, combined_count,
            msg_fmt.format(table=table,
                           rdr_count=rdr_count,
                           combined_count=combined_count))
def test_consented_person_id(self):
    """
    Test observation data has seven (7) persons with consent records as
    described below
     1: No
     2: Yes
     3: NULL
     4: No followed by Yes
     5: Yes followed by No
     6: Yes followed by NULL
     7: NULL and Yes with same date/time
    """
    # pre-conditions (sanity check)
    self.assertFalse(
        bq_utils.table_exists(EHR_CONSENT_TABLE_ID,
                              self.combined_dataset_id))

    # test
    ehr_consent()

    # post conditions
    self.assertTrue(
        bq_utils.table_exists(EHR_CONSENT_TABLE_ID,
                              self.combined_dataset_id),
        'Table {dataset}.{table} created by consented_person'.format(
            dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
    response = bq_utils.query('SELECT * FROM {dataset}.{table}'.format(
        dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
    rows = bq_utils.response2rows(response)
    expected = {2, 4}
    actual = set(row['person_id'] for row in rows)
    self.assertSetEqual(
        expected, actual,
        'Records in {dataset}.{table}'.format(
            dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
def test_integration_create_drug_route_mappings_table(self):
    if bq_utils.table_exists(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                             dataset_id=self.dataset_id):
        bq_utils.delete_table(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                              dataset_id=self.dataset_id)
    if not bq_utils.table_exists(populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
                                 dataset_id=self.dataset_id):
        populate_route_ids.create_dose_form_route_mappings_table(
            self.project_id, self.dataset_id)
    populate_route_ids.create_drug_route_mappings_table(
        self.project_id, self.dataset_id,
        populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
        self.route_mapping_prefix)
    time.sleep(10)

    query = ("SELECT COUNT(*) AS n "
             "FROM `{project_id}.{dataset_id}.{table_id}`").format(
                 project_id=self.project_id,
                 dataset_id=self.dataset_id,
                 table_id=populate_route_ids.DRUG_ROUTES_TABLE_ID)
    result = bq_utils.query(query)
    actual = bq_utils.response2rows(result)
    self.assertGreater(actual[0]["n"], 0)
def run_analyses(hpo_id):
    """
    Run the achilles analyses

    :param hpo_id: identifies the HPO site to run the analyses for
    :return: None
    """
    commands = _get_run_analysis_commands(hpo_id)
    for command in commands:
        logging.debug(' ---- Running `%s`...\n', command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']
            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created in 15 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
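# A hedged sketch of the temp-table branch above in isolation: bq_utils.query
# issues the job, and wait_on_jobs returns the ids of any jobs that did not
# finish in time. The table id 'temp_results' is a hypothetical example value.
def _example_query_to_temp_table(query):
    result = bq_utils.query(query, False, 'temp_results')
    incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
    if incomplete_jobs:
        raise RuntimeError('query job did not complete in time')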
def test_create_cdm_tables(self):
    # Sanity check
    for table in common.CDM_TABLES:
        self.assertFalse(
            bq_utils.table_exists(table, self.combined_dataset_id))

    create_cdm_tables()

    for table in common.CDM_TABLES:
        actual = bq_utils.table_exists(table, self.combined_dataset_id)
        self.assertTrue(
            actual,
            'Table {table} not created in combined dataset'.format(
                table=table))
def get_query_result(hpo_id,
                     query_string,
                     table_id,
                     query_wrapper,
                     is_subquery,
                     app_id=None,
                     dataset_id=None):
    """
    :param hpo_id: the name of the hpo_id for which validation is being done
    :param query_string: variable name of the query string stored in the
        constants
    :param table_id: name of the table the analysis is run on
    :param query_wrapper: wrapper over the unioned query, if required
    :param is_subquery: boolean flag indicating whether parsing into
        subqueries is needed
    :param app_id: name of the big query application id
    :param dataset_id: name of the big query dataset id
    :return: a list of row dictionaries
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    query = None
    result = None
    if is_subquery:
        sub_queries = []
        for table in cdm.tables_to_map():
            hpo_table = '{hpo_id}_{table_name}'.format(hpo_id=hpo_id,
                                                       table_name=table)
            if bq_utils.table_exists(hpo_table):
                sub_query = query_string.format(hpo_id=hpo_id,
                                                app_id=app_id,
                                                dataset_id=dataset_id,
                                                domain_table=table)
                sub_queries.append(sub_query)
        unioned_query = main_constants.UNION_ALL.join(sub_queries)
        if unioned_query and query_wrapper is not None:
            query = query_wrapper.format(union_of_subqueries=unioned_query)
        else:
            query = unioned_query
    else:
        table_name = '{hpo_name}_{results_table}'.format(
            hpo_name=hpo_id, results_table=table_id)
        if bq_utils.table_exists(table_name):
            query = query_string.format(application=app_id,
                                        dataset=dataset_id,
                                        table_id=table_name)
    if query:
        # Found achilles_heel_results table(s), run the query
        response = bq_utils.query(query)
        result = bq_utils.response2rows(response)
    if result is None:
        result = []
    return result
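# Usage sketch for get_query_result (illustrative values only): a plain,
# non-subquery lookup against one site's heel results table. 'fake_hpo' is a
# hypothetical site id, and query_string stands in for one of the constant
# query strings the docstring refers to.
def _example_get_heel_results(query_string):
    return get_query_result('fake_hpo',
                            query_string,
                            'achilles_heel_results',
                            query_wrapper=None,
                            is_subquery=False)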
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id
    columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        if 'unioned' in dataset_id:
            table_name = 'unioned_ehr_{table}'.format(table=table)
        else:
            table_name = table
        if bq_utils.table_exists(table_name, dataset_id):
            fields = resources.fields_for(table)
            # Generate column expressions for select
            col_exprs = [field['name'] for field in fields]
            cols = ',\n '.join(col_exprs)
            query = ID_DE_DUP_QUERY.format(columns=cols,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           domain_table=table,
                                           table_name=table_name)
            queries.append(query)
    return queries
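# A short sketch (assumed project and dataset names) showing how the generated
# de-duplication queries would typically be executed with the same bq_utils
# helper used elsewhere in this codebase.
def _example_run_id_dedup():
    queries = get_id_deduplicate_queries('fake-project', 'unioned_ehr_dataset')
    for query in queries:
        bq_utils.query(query)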
def setUp(self):
    self.project_id = bq_utils.app_identity.get_application_id()
    self.dataset_id = bq_utils.get_combined_dataset_id()
    self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
    if not self.project_id or not self.dataset_id:
        # TODO: Fix handling of globals, push these assertions down if they
        # are required.
        raise ValueError(
            f"missing configuration for project ('{self.project_id}') " +
            f"and/or dataset ('{self.dataset_id}')")

    # TODO: Reconcile this with a consistent integration testing model.
    # Ideally each test should clean up after itself so that we don't need
    # this defensive check.
    test_util.delete_all_tables(self.dataset_id)

    # drop concept table
    drop_concept_table(self.dataset_id)

    create_tables = ['person', 'observation']
    table_fields = {
        'person': 'post_deid_person',
        'observation': 'observation',
        'concept': 'concept'
    }
    for tbl in ['concept']:
        if not bq_utils.table_exists(tbl, dataset_id=self.dataset_id):
            create_tables.append(tbl)
    for tbl in create_tables:
        bq_utils.create_standard_table(table_fields[tbl],
                                       tbl,
                                       dataset_id=self.dataset_id,
                                       force_all_nullable=True)
def most_common_heel_errors(app_id=None, dataset_id=None, hpo_ids=None):
    """
    :param app_id: Application id
    :param dataset_id: Dataset id
    :param hpo_ids: list of HPO ids
    :return: None
    """
    heel_errors = list()
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if not os.path.exists(HEEL_ERRORS_JSON) and not os.path.exists(
            HEEL_ERRORS_CSV):
        for hpo_id in hpo_ids:
            if bq_utils.table_exists(
                    table_id='{hpo_id}_achilles_heel_results'.format(
                        hpo_id=hpo_id),
                    dataset_id=dataset_id):
                query = heel_error_query.format(app_id=app_id,
                                                dataset_id=dataset_id,
                                                hpo_id=hpo_id)
                query_job = bq_utils.query(query)
                result = bq_utils.response2rows(query_job)
                heel_errors.extend(result)
        with open(HEEL_ERRORS_JSON, 'w') as fp:
            json.dump(heel_errors, fp, sort_keys=True, indent=4)
    parse_json_csv()
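# Usage sketch: app_id and dataset_id fall back to the environment when
# omitted, so only the site ids are needed. The hpo ids below are
# hypothetical examples.
def _example_most_common_heel_errors():
    most_common_heel_errors(hpo_ids=['fake_hpo_a', 'fake_hpo_b'])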
def drop_concept_table(dataset_id):
    # check for the table in the same dataset it is dropped from
    if bq_utils.table_exists(common.CONCEPT, dataset_id):
        q = "DROP TABLE {dataset}.concept;".format(dataset=dataset_id)
        try:
            bq_utils.query(q)
        except HttpError as err:
            if err.resp.status != 404:
                raise
def test_create_standard_table(self):
    standard_tables = list(resources.CDM_TABLES) + ACHILLES_TABLES
    for standard_table in standard_tables:
        table_id = f'prefix_for_test_{standard_table}'
        result = bq_utils.create_standard_table(standard_table, table_id)
        self.assertTrue('kind' in result)
        self.assertEqual(result['kind'], 'bigquery#table')
        # sanity check
        self.assertTrue(bq_utils.table_exists(table_id))
def exist_participant_match(ehr_dataset_id, hpo_id):
    """
    This function checks if the hpo has submitted the participant_match data

    :param ehr_dataset_id: identifies the EHR dataset
    :param hpo_id: identifies the HPO site
    :return: True if the site's participant_match table exists
    """
    return bq_utils.table_exists(
        bq_utils.get_table_id(hpo_id, PARTICIPANT_MATCH), ehr_dataset_id)
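# Usage sketch (hypothetical dataset and site ids): guard downstream
# validation when a site has not submitted participant_match data.
def _example_guard_participant_match():
    if not exist_participant_match('fake_ehr_dataset', 'fake_hpo'):
        logging.info('no participant_match submission for fake_hpo')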
def test_validation_creation_and_population(self, mock_table_schema,
                                            mock_fields_for):
    # Preconditions
    mock_table_schema.return_value = self.schema
    mock_fields_for.return_value = self.id_match_fields

    expected = [{
        'person_id': 1,
        'first_name': 'missing_ehr',
        'last_name': 'missing_ehr',
        'algorithm': 'no'
    }, {
        'person_id': 2,
        'first_name': 'missing_rdr',
        'last_name': 'missing_ehr',
        'algorithm': 'no'
    }, {
        'person_id': 3,
        'first_name': 'missing_ehr',
        'last_name': 'missing_rdr',
        'algorithm': 'no'
    }, {
        'person_id': 4,
        'first_name': 'missing_rdr',
        'last_name': 'missing_rdr',
        'algorithm': 'no'
    }]

    # Creates the validation table if it does not already exist.
    # It will need to be created if this test is run individually.
    if not bq_utils.table_exists(self.id_match_table_id, self.dataset_id):
        id_validation.create_drc_validation_table(
            self.client,
            self.project_id,
            self.id_match_table_id,
            drc_dataset_id=self.dataset_id)

    # Test validation table population
    id_validation.populate_validation_table(self.client,
                                            self.project_id,
                                            self.id_match_table_id,
                                            self.hpo_id,
                                            drc_dataset_id=self.dataset_id)

    query_contents = CONTENT_QUERY.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        id_match_table_id=self.id_match_table_id)
    content_job = self.client.query(query_contents)
    contents = list(content_job.result())
    actual = [dict(row.items()) for row in contents]

    self.assertCountEqual(actual, expected)
def test_create_table(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result = bq_utils.create_table(table_id, fields)
    self.assertTrue('kind' in result)
    self.assertEqual(result['kind'], 'bigquery#table')
    # sanity check
    self.assertTrue(bq_utils.table_exists(table_id))
def create_metadata_table(dataset_id, fields_list):
    """
    Creates a metadata table in a given dataset.

    :param dataset_id: name of the dataset
    :param fields_list: list of field definitions for the metadata table
    :return: None
    """
    if not bq_utils.table_exists(METADATA_TABLE, dataset_id):
        bq_utils.create_table(table_id=METADATA_TABLE,
                              fields=fields_list,
                              dataset_id=dataset_id)
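# A minimal sketch of the fields_list argument: field definitions in the same
# dict form passed to bq_utils.create_table elsewhere in this codebase. The
# field names here are hypothetical examples, not the real metadata schema.
def _example_create_metadata_table(dataset_id):
    fields = [
        dict(name='etl_version', type='string', mode='nullable'),
        dict(name='creation_time', type='timestamp', mode='nullable')
    ]
    create_metadata_table(dataset_id, fields)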
def main(args):
    hpo_id = args.hpo_id
    for table_name in common.CDM_TABLES:
        table_id = hpo_id + '_' + table_name
        if bq_utils.table_exists(table_id):
            print(table_id, 'exists')
        else:
            print(table_id, 'being created')
            bq_utils.create_standard_table(table_name, table_id, False)
    _run_achilles(hpo_id)
    _run_export(hpo_id)
def test_create_table_drop_existing_success(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result_1 = bq_utils.create_table(table_id, fields)
    # sanity check
    table_id = result_1['tableReference']['tableId']
    self.assertTrue(bq_utils.table_exists(table_id))
    result_2 = bq_utils.create_table(table_id, fields, drop_existing=True)
    # same id and second one created after first one
    self.assertEqual(result_1['id'], result_2['id'])
    self.assertTrue(result_2['creationTime'] > result_1['creationTime'])
def drop_or_truncate_table(command):
    """
    Deletes or truncates a table

    Previously, deletion was used for both truncate and drop, and this
    function retains that behavior

    :param command: query to run
    :return: None
    """
    if sql_wrangle.is_truncate(command):
        table_id = sql_wrangle.get_truncate_table_name(command)
    else:
        table_id = sql_wrangle.get_drop_table_name(command)
    if bq_utils.table_exists(table_id):
        bq_utils.delete_table(table_id)
def main(args):
    hpo_id = args.hpo_id
    folder = args.folder
    folder_prefix = folder + '/'
    for table_name in common.CDM_TABLES:
        table_id = hpo_id + '_' + table_name
        if bq_utils.table_exists(table_id):
            print(table_id, 'exists')
        else:
            print(table_id, 'being created')
            bq_utils.create_standard_table(table_name, table_id, False)
    _run_achilles(hpo_id)
    _run_export(hpo_id, folder_prefix)
    _upload_achilles_files(hpo_id, folder_prefix)
def main(args):
    folder = args.folder
    target_bucket = args.bucket
    folder_prefix = folder + '/'
    for table_name in common.CDM_TABLES:
        table_id = table_name
        if bq_utils.table_exists(table_id):
            print(table_id, 'exists')
        else:
            print(table_id, 'being created')
            bq_utils.create_standard_table(table_name, table_id, False)
    _run_achilles()
    _run_export(folder_prefix=folder_prefix, target_bucket=target_bucket)
    _upload_achilles_files(folder_prefix=folder_prefix,
                           target_bucket=target_bucket)
def test_execute_queries(self):
    project_id = bq_utils.app_identity.get_application_id()
    dataset_id = bq_utils.get_combined_dataset_id()
    sandbox_id = bq_utils.get_unioned_dataset_id()
    test_util.delete_all_tables(dataset_id)

    create_tables = (
        ['person'] + common.CLINICAL_DATA_TABLES +
        ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
    # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
    for tbl in ['concept', 'concept_ancestor']:
        if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
            create_tables.append(tbl)
    for tbl in create_tables:
        bq_utils.create_standard_table(tbl,
                                       tbl,
                                       dataset_id=dataset_id,
                                       force_all_nullable=True)

    for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
        resp = bq_utils.query(
            tmpl.render(project_id=project_id,
                        dataset_id=dataset_id,
                        rdr_basics_concept_id=123,
                        rdr_consent_concept_id=345,
                        ehr_obs_concept_id=567,
                        rdr_basics_module_concept_id=
                        drop_participants_without_ppi_or_ehr.
                        BASICS_MODULE_CONCEPT_ID))
        self.assertTrue(resp["jobComplete"])

    clean_cdr_engine.clean_dataset(
        project_id, dataset_id, sandbox_id,
        [(drop_participants_without_ppi_or_ehr.get_queries,)])

    def table_to_person_ids(t):
        rows = bq_utils.response2rows(
            bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                project_id, dataset_id, t)))
        return set([r["person_id"] for r in rows])

    # We expect participants 1, 5 to have been removed from all tables.
    self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
    self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
    self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

    test_util.delete_all_tables(dataset_id)
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                      'dummy\n')
        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists and is populated
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)
    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    table_id = f'{IDENTITY_MATCH_TABLE}_{args.hpo_id}'

    # Create the hpo_site identity match table if it does not exist
    if not table_exists(table_id, DRC_OPS):
        create_drc_validation_table(client, args.project_id, table_id)

    # Populate the validation table for the site
    populate_validation_table(client, args.project_id, table_id, args.hpo_id)
def load_deid_map_table(deid_map_dataset_name, age_limit):
    # Create _deid_map table in input dataset
    project_id = app_identity.get_application_id()
    client = bq.get_client(project_id)
    deid_map_table = f'{project_id}.{deid_map_dataset_name}.{DEID_MAP_TABLE}'

    # Copy master _deid_map table records to _deid_map table
    if bq_utils.table_exists(DEID_MAP_TABLE,
                             dataset_id=PIPELINE_TABLES_DATASET):
        copy_deid_map_table(deid_map_table, project_id,
                            PIPELINE_TABLES_DATASET, deid_map_dataset_name,
                            age_limit, client)
        logging.info(
            f'copied participants younger than {age_limit} to the table '
            f'{deid_map_table}')
    else:
        raise RuntimeError(
            f'{DEID_MAP_TABLE} is not available in '
            f'{project_id}.{PIPELINE_TABLES_DATASET}')
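# Usage sketch (hypothetical dataset name and age cutoff): the function raises
# RuntimeError when the master _deid_map table is missing from the pipeline
# tables dataset.
def _example_load_deid_map():
    load_deid_map_table(deid_map_dataset_name='fake_deid_dataset',
                        age_limit=89)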
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_filename: str = f'{cdm_table}.csv'
        cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_filename)

        bucket = self.storage_client.get_bucket(self.hpo_bucket)
        cdm_blob = bucket.blob(cdm_filename)
        if os.path.exists(cdm_filepath):
            cdm_blob.upload_from_filename(cdm_filepath)
        else:
            cdm_blob.upload_from_string('dummy\n')

        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists and is populated
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def _create_drug_class_table(bigquery_dataset_id):
    table_name = 'drug_class'
    fields = [{
        "type": "integer",
        "name": "concept_id",
        "mode": "required"
    }, {
        "type": "string",
        "name": "concept_name",
        "mode": "required"
    }, {
        "type": "string",
        "name": "drug_class_name",
        "mode": "required"
    }]
    bq_utils.create_table(table_id=table_name,
                          fields=fields,
                          drop_existing=True,
                          dataset_id=bigquery_dataset_id)

    bq_utils.query(q=main_consts.DRUG_CLASS_QUERY.format(
        dataset_id=bigquery_dataset_id),
                   use_legacy_sql=False,
                   destination_table_id='drug_class',
                   retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                   write_disposition='WRITE_TRUNCATE',
                   destination_dataset_id=bigquery_dataset_id)

    # ensure concept_ancestor table exists and is populated
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)
        q = """INSERT INTO {dataset}.concept_ancestor
               SELECT * FROM {vocab}.concept_ancestor""".format(
            dataset=bigquery_dataset_id, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
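# A hedged follow-up sketch: once drug_class has been populated, it can be
# queried with the same helpers used above. The dataset id is whatever was
# passed to _create_drug_class_table.
def _example_count_drug_classes(bigquery_dataset_id):
    q = 'SELECT COUNT(*) AS n FROM {dataset}.drug_class'.format(
        dataset=bigquery_dataset_id)
    return bq_utils.response2rows(bq_utils.query(q))[0]['n']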
def copy_vocabulary_tables(input_dataset, dest_dataset):
    for table in VOCABULARY_TABLES:
        if bq_utils.table_exists(table, dataset_id=input_dataset):
            # the copy itself is not implemented here
            pass
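# A hedged sketch of the copy step the stub above leaves unimplemented, using
# the google.cloud.bigquery client obtained via bq.get_client as elsewhere in
# this codebase; client.copy_table accepts fully-qualified table id strings.
# All ids below come from the caller, so nothing here names real resources.
def copy_vocabulary_tables_sketch(client, project_id, input_dataset,
                                  dest_dataset):
    for table in VOCABULARY_TABLES:
        if bq_utils.table_exists(table, dataset_id=input_dataset):
            source = f'{project_id}.{input_dataset}.{table}'
            destination = f'{project_id}.{dest_dataset}.{table}'
            client.copy_table(source, destination).result()  # wait for the job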