def test_merge_with_good_data(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'), 'person.csv',
                                fp)
    result = bq_utils.load_cdm_csv('nyc', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    nyc_person_ids = [
        int(row['person_id'])
        for row in resources.csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
    ]
    pitt_person_ids = [
        int(row['person_id'])
        for row in resources.csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
    ]
    expected_result = nyc_person_ids + pitt_person_ids
    expected_result.sort()

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_person', 'pitt_person'))

    dataset_id = self.dataset_id
    table_ids = ['nyc_person', 'pitt_person']
    merged_table_id = 'merged_nyc_pitt'
    success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                dataset_id, merged_table_id)
    self.assertTrue(success_flag)
    self.assertEqual(error, "")

    query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
        dataset_id=dataset_id, table_id=merged_table_id)
    merged_query_job_result = bq_utils.query(query_string)
    self.assertIsNone(merged_query_job_result.get('errors', None))

    actual_result = [
        int(row['f'][0]['v']) for row in merged_query_job_result['rows']
    ]
    actual_result.sort()
    self.assertCountEqual(expected_result, actual_result)
def test_pii_files_loaded(self, mock_check_cron):
    # tests if pii files are loaded
    test_file_paths = [
        test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
    ]
    test_file_names = [os.path.basename(f) for f in test_file_paths]
    test_util.write_cloud_file(self.hpo_bucket,
                               test_util.PII_NAME_FILE,
                               prefix=self.folder_prefix)
    test_util.write_cloud_file(self.hpo_bucket,
                               test_util.PII_MRN_BAD_PERSON_ID_FILE,
                               prefix=self.folder_prefix)

    rs = resources.csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
    expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                         int(r['loaded'])) for r in rs]
    for f in common.SUBMISSION_FILES:
        if f not in test_file_names:
            expected_result = (f, 0, 0, 0)
            expected_results.append(expected_result)

    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(expected_results), set(r['results']))
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = self.dataset_id
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]

    sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
    bucket_blob = sc_bucket.blob('observation.csv')
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)

    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')

    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))

    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertCountEqual(actual_result, expected_observation_ids)
def test_create_dose_form_route_mappings_table_with_dataset_id(
        self, mock_query, mock_create_table):
    # pre conditions
    route_mappings_csv = os.path.join(
        resources.resource_files_path,
        populate_route_ids.DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings = resources.csv_to_list(route_mappings_csv)
    mapping_list = populate_route_ids.get_mapping_list(
        dose_form_route_mappings)
    query_params = dict(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        routes_table_id=populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=mapping_list)
    expected_query = populate_route_ids.INSERT_ROUTES_QUERY.format(
        **query_params)

    # test
    populate_route_ids.create_dose_form_route_mappings_table(
        self.project_id, self.dataset_id)

    # post conditions
    mock_query.assert_called_with(expected_query)
    mock_create_table.assert_called_with(
        populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
        populate_route_ids.DOSE_FORM_ROUTE_FIELDS,
        drop_existing=True,
        dataset_id=self.dataset_id)
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns
    from resources/dose_form_route_mappings.csv

    :param project_id: identifies the BQ project
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    dose_form_routes_table_id = DOSE_FORM_ROUTES_TABLE_ID

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result
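# Hedged usage sketch (not part of the source module; the project and dataset
# names below are placeholders): the helper above can be invoked directly to
# (re)build the routes lookup table, and it returns the bq_utils.query() job
# metadata for the populate step.
result = create_dose_form_route_mappings_table('my-gcp-project',
                                               dataset_id='scratch_dataset')
# Passing dataset_id=None instead falls back to bq_utils.get_dataset_id(), as
# handled at the top of the function.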
def test_measurement_concept_sets_table(self):
    query = sql_wrangle.qualify_tables(
        '''SELECT * FROM {dataset_id}.{table_id}'''.format(
            dataset_id=self.dataset_id,
            table_id=MEASUREMENT_CONCEPT_SETS_TABLE))
    response = bq_utils.query(query)

    actual_fields = [{
        'name': field['name'].lower(),
        'type': field['type'].lower()
    } for field in response['schema']['fields']]
    expected_fields = [{
        'name': field['name'].lower(),
        'type': field['type'].lower()
    } for field in resources.fields_for(MEASUREMENT_CONCEPT_SETS_TABLE)]
    self.assertListEqual(expected_fields, actual_fields)

    measurement_concept_sets_table_path = os.path.join(
        resources.resource_path, MEASUREMENT_CONCEPT_SETS_TABLE + '.csv')
    expected_total_rows = len(
        resources.csv_to_list(measurement_concept_sets_table_path))
    self.assertEqual(expected_total_rows, int(response['totalRows']))
def load_test_data(self, hpo_id: str = None):
    """
    Load test achilles heel results data from a csv file into BigQuery

    :param hpo_id: if specified, prefix to use on the csv test file and bq
        table, otherwise no prefix is used
    :return: contents of the file as list of objects
    """
    table_name: str = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id: str = bq_utils.get_table_id(hpo_id, table_name)
    else:
        table_id: str = table_name
    test_file_name: str = f'{table_id}.csv'
    test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                       test_file_name)

    target_bucket = self.storage_client.get_bucket(self.bucket)
    test_blob = target_bucket.blob(test_file_name)
    test_blob.upload_from_filename(test_file_path)

    gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
    load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources.csv_to_list(test_file_path)
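# Hedged usage sketch (illustrative only; the test method and hpo id below are
# hypothetical, and 'analysis_id' is assumed to be one of the csv's header
# columns): because load_test_data returns the csv contents via
# resources.csv_to_list, a test can reuse those same rows as expected values
# once the load job finishes.
def test_heel_results_loaded(self):
    heel_rows = self.load_test_data(hpo_id='fake_hpo')
    expected_analysis_ids = [int(row['analysis_id']) for row in heel_rows]
    self.assertGreater(len(expected_analysis_ids), 0)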
def _load_datasets(self):
    """
    Load five persons data for nyc and pitt test hpo and rdr data for the
    excluded_hpo

    # expected_tables is for testing output
    # it maps table name to list of expected records
    #  ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected_tables: dict = {}
    running_jobs: list = []
    for cdm_table in resources.CDM_TABLES:
        output_table: str = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            cdm_filename: str = f'{cdm_table}.csv'
            if hpo_id == NYC_HPO_ID:
                cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_filename)
            elif hpo_id == PITT_HPO_ID:
                cdm_filepath: str = os.path.join(
                    test_util.PITT_FIVE_PERSONS_PATH, cdm_filename)
            elif hpo_id == EXCLUDED_HPO_ID:
                if cdm_table in [
                        'observation', 'person', 'visit_occurrence'
                ]:
                    cdm_filepath: str = os.path.join(test_util.RDR_PATH,
                                                     cdm_filename)

            bucket: str = gcs_utils.get_hpo_bucket(hpo_id)
            gcs_bucket = self.storage_client.get_bucket(bucket)
            if os.path.exists(cdm_filepath):
                csv_rows = resources.csv_to_list(cdm_filepath)
                cdm_blob = gcs_bucket.blob(cdm_filename)
                cdm_blob.upload_from_filename(cdm_filepath)
            else:
                # results in empty table
                cdm_blob = gcs_bucket.blob(cdm_filename)
                cdm_blob.upload_from_string('dummy\n')
                csv_rows: list = []

            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            if hpo_id != EXCLUDED_HPO_ID:
                expected_tables[output_table] += list(csv_rows)

    # ensure person to observation output is as expected
    output_table_person: str = ehr_union.output_table_for(common.PERSON)
    output_table_observation: str = ehr_union.output_table_for(
        common.OBSERVATION)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]

    incomplete_jobs: list = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message: str = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
def get_nyc_cu_cols():
    result = []
    cols = resources.csv_to_list(test_util.TEST_NYC_CU_COLS_CSV)
    for col in cols:
        omop_table_name = completeness.get_standard_table_name(
            col[consts.TABLE_NAME])
        if omop_table_name:
            col[consts.OMOP_TABLE_NAME] = omop_table_name
            result.append(col)
    return result
def _load_datasets(self):
    """
    Load five persons data for each test hpo

    # expected_tables is for testing output
    # it maps table name to list of expected records
    #  ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected_tables = dict()
    running_jobs = []
    for cdm_table in resources.CDM_TABLES:
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            if hpo_id == NYC_HPO_ID:
                cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            else:
                cdm_file_name = os.path.join(
                    test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
            bucket = gcs_utils.get_hpo_bucket(hpo_id)

            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources.csv_to_list(cdm_file_name)
            else:
                # results in empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                csv_rows = []

            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)

    # ensure person to observation output is as expected
    output_table_person = ehr_union.output_table_for(
        combine_ehr_rdr.PERSON_TABLE)
    output_table_observation = ehr_union.output_table_for(
        combine_ehr_rdr.OBSERVATION_TABLE)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
def load_table_from_csv(project_id,
                        dataset_id,
                        table_name,
                        csv_path=None,
                        fields=None):
    """
    Loads BQ table from a csv file without making use of GCS buckets

    :param project_id: project containing the dataset
    :param dataset_id: dataset where the table needs to be created
    :param table_name: name of the table to be created
    :param csv_path: path to the csv file which needs to be loaded into BQ.
        If None, assumes that the file exists in the resource_files folder
        with the name table_name.csv
    :param fields: fields in list of dicts format. If set to None, assumes
        that the fields are stored in a json file in resource_files/fields
        named table_name.json
    :return: BQ response for the load query
    """
    if csv_path is None:
        csv_path = os.path.join(resources.resource_files_path,
                                table_name + ".csv")
    table_list = resources.csv_to_list(csv_path)

    if fields is None:
        fields_filename = os.path.join(resources.fields_path,
                                       table_name + '.json')
        with open(fields_filename, 'r') as f:
            fields = json.load(f)
    field_names = ', '.join([field['name'] for field in fields])

    row_exprs = [csv_line_to_sql_row_expr(t, fields) for t in table_list]
    formatted_mapping_list = ', '.join(row_exprs)

    create_table(table_id=table_name,
                 fields=fields,
                 drop_existing=True,
                 dataset_id=dataset_id)

    table_populate_query = bq_consts.INSERT_QUERY.format(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_name,
        columns=field_names,
        mapping_list=formatted_mapping_list)
    result = query(table_populate_query)
    return result
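# Hedged usage sketch (placeholder names, not from the source): loading a small
# lookup csv that ships with the repo without staging it in GCS. With csv_path
# and fields left as None, the helper looks for site_lookup.csv under
# resources.resource_files_path and site_lookup.json under
# resources.fields_path.
response = load_table_from_csv(project_id='my-gcp-project',
                               dataset_id='lookup_dataset',
                               table_name='site_lookup')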
def load_test_data(self, hpo_id=None):
    """
    Load test achilles heel results data from a csv file into BigQuery

    :param hpo_id: if specified, prefix to use on the csv test file and bq
        table, otherwise no prefix is used
    :return: contents of the file as list of objects
    """
    table_name = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
    else:
        table_id = table_name
    test_file_name = table_id + '.csv'
    test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
    test_util.write_cloud_file(self.bucket, test_file_path)

    gcs_path = 'gs://' + self.bucket + '/' + test_file_name
    load_results = bq_utils.load_csv(table_name, gcs_path, self.app_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources.csv_to_list(test_file_path)
def create_unit_mapping_table(project_id, dataset_id):
    """
    This function creates the unit_mapping table and populates it with the
    values from resources/unit_mapping.csv

    :param project_id: identifies the BQ project
    :param dataset_id: identifies the dataset in which to create the table
    :return: response from the BQ insert query
    """
    bq_utils.create_table(table_id=UNIT_MAPPING_TABLE,
                          fields=UNIT_MAPPING_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)
    unit_mappings_csv = os.path.join(resources.resource_path,
                                     UNIT_MAPPING_FILE + ".csv")
    unit_mappings_list = resources.csv_to_list(unit_mappings_csv)
    unit_mappings_populate_query = INSERT_UNITS_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        units_table_id=UNIT_MAPPING_TABLE,
        mapping_list=get_mapping_list(unit_mappings_list))
    result = bq_utils.query(unit_mappings_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, UNIT_MAPPING_TABLE)
    return result
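# Hedged usage sketch (placeholder names, not from the source): building the
# unit_mapping lookup in the dataset that downstream measurement-normalization
# queries would presumably read from; the return value is the bq_utils.query()
# response for the populate step.
create_unit_mapping_table('my-gcp-project', 'my_combined_dataset')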