def _load_data(self): # Load measurement_concept_sets required_labs.load_measurement_concept_sets_table( project_id=self.project_id, dataset_id=self.dataset_id) # Load measurement_concept_sets_descendants required_labs.load_measurement_concept_sets_descendants_table( project_id=self.project_id, dataset_id=self.dataset_id) # Create concept and concept_ancestor empty tables if not exist if not bq_utils.table_exists(common.CONCEPT, self.dataset_id): bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT) if not bq_utils.table_exists(common.CONCEPT, self.dataset_id): bq_utils.create_standard_table(common.CONCEPT_ANCESTOR, common.CONCEPT_ANCESTOR) # we need to load measurement.csv into bigquery_dataset_id in advance for the other integration tests ehr_measurement_result = bq_utils.load_table_from_csv( project_id=self.project_id, dataset_id=self.dataset_id, table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT), csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV, fields=resources.fields_for(common.MEASUREMENT)) bq_utils.wait_on_jobs( [ehr_measurement_result['jobReference']['jobId']])
def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True): from validation import achilles, achilles_heel import app_identity app_id = app_identity.get_application_id() test_file_name = achilles.ACHILLES_ANALYSIS + '.csv' achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH, test_file_name) schema_name = achilles.ACHILLES_ANALYSIS write_cloud_file(hpo_bucket, achilles_analysis_file_path) gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name dataset_id = bq_utils.get_dataset_id() table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS) bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id) table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST] if include_heel: table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS) running_jobs = [] for table_name in table_names: test_file_name = table_name + '.csv' test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH, table_name + '.csv') write_cloud_file(hpo_bucket, test_file_path) gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name dataset_id = bq_utils.get_dataset_id() table_id = bq_utils.get_table_id(hpo_id, table_name) load_results = bq_utils.load_csv(table_name, gcs_path, app_id, dataset_id, table_id) running_jobs.append(load_results['jobReference']['jobId']) bq_utils.wait_on_jobs(running_jobs)
def load_test_data(self, hpo_id: str = None): """ Load to bq test achilles heel results data from csv file :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used :return: contents of the file as list of objects """ table_name: str = common.ACHILLES_HEEL_RESULTS if hpo_id is not None: table_id: str = bq_utils.get_table_id(hpo_id, table_name) else: table_id: str = table_name test_file_name: str = f'{table_id}.csv' test_file_path: str = os.path.join(test_util.TEST_DATA_PATH, test_file_name) target_bucket = self.storage_client.get_bucket(self.bucket) test_blob = target_bucket.blob(test_file_name) test_blob.upload_from_filename(test_file_path) gcs_path: str = f'gs://{self.bucket}/{test_file_name}' load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id, self.dataset_id, table_id) job_id = load_results['jobReference']['jobId'] bq_utils.wait_on_jobs([job_id]) return resources.csv_to_list(test_file_path)
def load_dataset_from_files(dataset_id, path): app_id = bq_utils.app_identity.get_application_id() bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID) test_util.empty_bucket(bucket) job_ids = [] for table in common.CDM_TABLES: filename = table + '.csv' schema = os.path.join(resources.fields_path, table + '.json') f = os.path.join(path, filename) if os.path.exists(os.path.join(path, filename)): with open(f, 'r') as fp: gcs_utils.upload_object(bucket, filename, fp) else: test_util.write_cloud_str(bucket, filename, '\n') gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket, filename=filename) load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id, table, allow_jagged_rows=True) load_job_id = load_results['jobReference']['jobId'] job_ids.append(load_job_id) incomplete_jobs = bq_utils.wait_on_jobs(job_ids) if len(incomplete_jobs) > 0: message = "Job id(s) %s failed to complete" % incomplete_jobs raise RuntimeError(message) test_util.empty_bucket(bucket)
def run_heel(hpo_id): # very long test commands = _get_heel_commands(hpo_id) count = 0 for command in commands: count = count + 1 logging.debug(' ---- running query # {}'.format(count)) logging.debug(' ---- Running `%s`...\n' % command) if sql_wrangle.is_to_temp_table(command): table_id = sql_wrangle.get_temp_table_name(command) query = sql_wrangle.get_temp_table_query(command) insert_query_job_result = bq_utils.query(query, False, table_id) query_job_id = insert_query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: logging.critical('tempresults doesnt get created in 30 secs') raise RuntimeError('Tempresults taking too long to create') elif sql_wrangle.is_truncate(command): table_id = sql_wrangle.get_truncate_table_name(command) query = 'DELETE FROM %s WHERE TRUE' % table_id bq_utils.query(query) elif sql_wrangle.is_drop(command): table_id = sql_wrangle.get_drop_table_name(command) bq_utils.delete_table(table_id) else: bq_utils.query(command) time.sleep(1)
def _load_datasets(self): """ Load five persons data for each test hpo """ # expected_tables is for testing output # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...] expected_tables = dict() running_jobs = [] for cdm_table in common.CDM_TABLES: cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv') output_table = ehr_union.output_table_for(cdm_table) expected_tables[output_table] = [] for hpo_id in self.hpo_ids: # upload csv into hpo bucket bucket = gcs_utils.get_hpo_bucket(hpo_id) if os.path.exists(cdm_file_name): test_util.write_cloud_file(bucket, cdm_file_name) csv_rows = resources._csv_to_list(cdm_file_name) else: # results in empty table test_util.write_cloud_str(bucket, cdm_table + '.csv', 'dummy\n') csv_rows = [] # load table from csv result = bq_utils.load_cdm_csv(hpo_id, cdm_table) running_jobs.append(result['jobReference']['jobId']) expected_tables[output_table] += list(csv_rows) incomplete_jobs = bq_utils.wait_on_jobs(running_jobs) if len(incomplete_jobs) > 0: message = "Job id(s) %s failed to complete" % incomplete_jobs raise RuntimeError(message) self.expected_tables = expected_tables
def _load_datasets(self): load_jobs = [] self.expected_tables = dict() for cdm_table in common.CDM_TABLES: cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_table + '.csv') result_table = ehr_merge.result_table_for(cdm_table) if os.path.exists(cdm_file_name): # one copy for chs, the other for pitt csv_rows = resources._csv_to_list(cdm_file_name) self.expected_tables[result_table] = csv_rows + list(csv_rows) test_util.write_cloud_file(self.chs_bucket, cdm_file_name) test_util.write_cloud_file(self.pitt_bucket, cdm_file_name) else: self.expected_tables[result_table] = [] test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv', 'dummy\n') test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv', 'dummy\n') chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table) pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table) chs_load_job_id = chs_load_results['jobReference']['jobId'] pitt_load_job_id = pitt_load_results['jobReference']['jobId'] load_jobs.append(chs_load_job_id) load_jobs.append(pitt_load_job_id) incomplete_jobs = bq_utils.wait_on_jobs(load_jobs) if len(incomplete_jobs) > 0: raise RuntimeError('BigQuery jobs %s failed to complete' % incomplete_jobs)
def query(q, dst_table_id, dst_dataset_id, write_disposition='WRITE_APPEND'): """ Run query and save results to a table :param q: SQL statement :param dst_table_id: save results in a table with the specified id :param dst_dataset_id: identifies output dataset :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema) :return: query result """ query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, destination_dataset_id=dst_dataset_id, write_disposition=write_disposition) query_job_id = query_job_result['jobReference']['jobId'] logging.info(f'Job {query_job_id} started for table {dst_table_id}') job_status = query_job_result['status'] error_result = job_status.get('errorResult') if error_result is not None: msg = f'Job {query_job_id} failed because: {error_result}' raise bq_utils.InvalidOperationError(msg) incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: raise bq_utils.BigQueryJobWaitError(incomplete_jobs) return query_job_result
def test_load_csv(self): from google.appengine.api import app_identity app_id = app_identity.get_application_id() table_name = 'achilles_analysis' schema_file_name = table_name + '.json' csv_file_name = table_name + '.csv' schema_path = os.path.join(resources.fields_path, schema_file_name) local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH, csv_file_name) with open(local_csv_path, 'r') as fp: response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name, fp) hpo_bucket = self.hpo_bucket gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals() dataset_id = bq_utils.get_dataset_id() load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id, dataset_id, table_name) load_job_id = load_results['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([load_job_id]) self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(table_name)) query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' % locals()) self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
def run_heel(hpo_id): """ Run heel commands :param hpo_id: string name for the hpo identifier :returns: None :raises RuntimeError: Raised if BigQuery takes longer than 30 seconds to complete a job on a temporary table """ # very long test commands = _get_heel_commands(hpo_id) count = 0 for command in commands: count = count + 1 logging.debug(' ---- running query # {}'.format(count)) logging.debug(' ---- Running `%s`...\n' % command) if sql_wrangle.is_to_temp_table(command): table_id = sql_wrangle.get_temp_table_name(command) query = sql_wrangle.get_temp_table_query(command) insert_query_job_result = bq_utils.query(query, False, table_id) query_job_id = insert_query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: logging.critical('tempresults doesnt get created in 30 secs') raise RuntimeError('Tempresults taking too long to create') elif sql_wrangle.is_truncate(command): table_id = sql_wrangle.get_truncate_table_name(command) query = 'DELETE FROM %s WHERE TRUE' % table_id bq_utils.query(query) elif sql_wrangle.is_drop(command): table_id = sql_wrangle.get_drop_table_name(command) bq_utils.delete_table(table_id) else: bq_utils.query(command)
def create_mapping_table(hpos_with_visit, project_id, dataset_id): """ creates the visit mapping table :hpos_with_visit: hpos that should be including the visit occurrence table :project_id: project with the dataset :dataset_id: dataset with the tables :returns: string if visit table failed; otherwise none """ # list of hpos with visit table and creating visit id mapping table queries visit_hpo_queries = [] for hpo in hpos_with_visit: visit_hpo_queries.append(VISIT_ID_HPO_BLOCK % locals()) union_all_blocks = '\n UNION ALL'.join(visit_hpo_queries) visit_mapping_query = VIST_ID_MAPPING_QUERY_SKELETON % locals() logging.info('Loading ' + VISIT_ID_MAPPING_TABLE) query_result = query(visit_mapping_query, destination_table_id=VISIT_ID_MAPPING_TABLE, write_disposition='WRITE_TRUNCATE') visit_mapping_query_job_id = query_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([visit_mapping_query_job_id]) if len(incomplete_jobs) == 0: query_result = bq_utils.get_job_details(visit_mapping_query_job_id) if 'errors' in query_result['status']: errors = query_result['status']['errors'] message = 'Failed to load %s due to the following error(s): %s' % ( VISIT_ID_MAPPING_TABLE, errors) logging.error(message) raise RuntimeError(message) else: message = 'Failed to load %s. Job with ID %s never completed.' % ( VISIT_ID_MAPPING_TABLE, visit_mapping_query_job_id) logging.error(message) raise RuntimeError(message)
def create_match_values_table(project, rdr_dataset, destination_dataset): """ Get the desired matching values from the combined observation table. This retrieves all possible matching values from the observation table used in participant matching. Returned query data is limited to the person_id, observation_concept_id, and the value_as_string fields. :param project: The project name to query :param rdr_dataset: The rdr dataset name to query :param destination_dataset: The dataset to write the result table to :return: The name of the interim table """ query_string = consts.ALL_PPI_OBSERVATION_VALUES.format( project=project, dataset=rdr_dataset, table=consts.OBSERVATION_TABLE, pii_list=','.join(consts.PII_CODES_LIST)) LOGGER.info("Participant validation ran the query\n%s", query_string) results = bq_utils.query(query_string, destination_dataset_id=destination_dataset, destination_table_id=consts.ID_MATCH_TABLE, write_disposition='WRITE_TRUNCATE', batch=True) query_job_id = results['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if incomplete_jobs != []: raise bq_utils.BigQueryJobWaitError(incomplete_jobs) return consts.ID_MATCH_TABLE
def test_load_ehr_observation(self): observation_query = "SELECT observation_id FROM {}.{} ORDER BY observation_id" hpo_id = 'pitt' expected_observation_ids = [ int(row['observation_id']) for row in resources._csv_to_list( PITT_FIVE_PERSONS_OBSERVATION_CSV) ] with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp: gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id), 'observation.csv', fp) result = bq_utils.load_cdm_csv(hpo_id, 'observation') job_id = result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([job_id]) self.assertEqual(len(incomplete_jobs), 0, 'pitt_observation load job did not complete') load_job_result = bq_utils.get_job_details(job_id) load_job_result_status = load_job_result['status'] load_job_errors = load_job_result_status.get('errors') self.assertIsNone(load_job_errors, msg='pitt_observation load job failed: ' + str(load_job_errors)) query_string = observation_query.format(bq_utils.get_dataset_id(), 'pitt_observation') query_results_response = bq_utils.query_table(query_string) query_job_errors = query_results_response.get('errors') self.assertIsNone(query_job_errors) actual_result = [ int(row['f'][0]['v']) for row in query_results_response['rows'] ] self.assertListEqual(actual_result, expected_observation_ids)
def retraction_query_runner(queries): query_job_ids = [] for query_dict in queries: logger.debug('Retracting from %s.%s using query %s' % (query_dict[DEST_DATASET], query_dict[DEST_TABLE], query_dict[QUERY])) if query_dict[DELETE_FLAG]: job_results = bq_utils.query(q=query_dict[QUERY], batch=True) rows_affected = job_results['numDmlAffectedRows'] logger.debug('%s rows deleted from %s.%s' % (rows_affected, query_dict[DEST_DATASET], query_dict[DEST_TABLE])) else: job_results = bq_utils.query( q=query_dict[QUERY], destination_table_id=query_dict[DEST_TABLE], write_disposition=WRITE_TRUNCATE, destination_dataset_id=query_dict[DEST_DATASET], batch=True) query_job_id = job_results['jobReference']['jobId'] query_job_ids.append(query_job_id) incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids) if incomplete_jobs: logger.debug('Failed on {count} job ids {ids}'.format( count=len(incomplete_jobs), ids=incomplete_jobs)) logger.debug('Terminating retraction') raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
def test_load_ehr_observation(self): hpo_id = 'pitt' dataset_id = self.dataset_id table_id = bq_utils.get_table_id(hpo_id, table_name='observation') q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format( dataset_id=dataset_id, table_id=table_id) expected_observation_ids = [ int(row['observation_id']) for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV) ] sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id)) bucket_blob = sc_bucket.blob('observation.csv') with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp: bucket_blob.upload_from_file(fp) result = bq_utils.load_cdm_csv(hpo_id, 'observation') job_id = result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([job_id]) self.assertEqual(len(incomplete_jobs), 0, 'pitt_observation load job did not complete') load_job_result = bq_utils.get_job_details(job_id) load_job_result_status = load_job_result['status'] load_job_errors = load_job_result_status.get('errors') self.assertIsNone(load_job_errors, msg='pitt_observation load job failed: ' + str(load_job_errors)) query_results_response = bq_utils.query(q) query_job_errors = query_results_response.get('errors') self.assertIsNone(query_job_errors) actual_result = [ int(row['f'][0]['v']) for row in query_results_response['rows'] ] self.assertCountEqual(actual_result, expected_observation_ids)
def run_analyses(hpo_id): """ Run the achilles analyses :param hpo_id: :return: """ commands = _get_run_analysis_commands(hpo_id) for command in commands: logging.debug(' ---- Running `%s`...\n' % command) if sql_wrangle.is_to_temp_table(command): table_id = sql_wrangle.get_temp_table_name(command) query = sql_wrangle.get_temp_table_query(command) insert_query_job_result = bq_utils.query(query, False, table_id) query_job_id = insert_query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: logging.critical('tempresults doesnt get created in 15 secs') raise RuntimeError('Tempresults taking too long to create') elif sql_wrangle.is_truncate(command): table_id = sql_wrangle.get_truncate_table_name(command) if bq_utils.table_exists(table_id): bq_utils.delete_table(table_id) elif sql_wrangle.is_drop(command): table_id = sql_wrangle.get_drop_table_name(command) if bq_utils.table_exists(table_id): bq_utils.delete_table(table_id) else: bq_utils.query(command)
def test_merge_with_unmatched_schema(self): running_jobs = [] with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp: gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'), 'measurement.csv', fp) result = bq_utils.load_cdm_csv('nyc', 'measurement') running_jobs.append(result['jobReference']['jobId']) with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp: gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'), 'person.csv', fp) result = bq_utils.load_cdm_csv('pitt', 'person') running_jobs.append(result['jobReference']['jobId']) incomplete_jobs = bq_utils.wait_on_jobs(running_jobs) self.assertEqual( len(incomplete_jobs), 0, 'loading tables {},{} timed out'.format('nyc_measurement', 'pitt_person')) table_names = ['nyc_measurement', 'pitt_person'] success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(), table_names, bq_utils.get_dataset_id(), 'merged_nyc_pitt') self.assertFalse(success)
def merge(dataset_id, project_id): """merge hpo ehr data :dataset_id: source and target dataset :project_id: project in which everything happens :returns: list of tables generated successfully """ logging.info('Starting merge') existing_tables = bq_utils.list_dataset_contents(dataset_id) hpos_to_merge = [] hpos_with_visit = [] for item in resources.hpo_csv(): hpo_id = item['hpo_id'] if hpo_id + '_person' in existing_tables: hpos_to_merge.append(hpo_id) if hpo_id + '_visit_occurrence' in existing_tables: hpos_with_visit.append(hpo_id) logging.info('HPOs to merge: %s' % hpos_to_merge) logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit) create_mapping_table(hpos_with_visit, project_id, dataset_id) # before loading [drop and] create all tables to ensure they are set up properly for cdm_file_name in common.CDM_FILES: cdm_table_name = cdm_file_name.split('.')[0] result_table = result_table_for(cdm_table_name) bq_utils.create_standard_table(cdm_table_name, result_table, drop_existing=True) jobs_to_wait_on = [] for table_name in common.CDM_TABLES: q = construct_query(table_name, hpos_to_merge, hpos_with_visit, project_id, dataset_id) logging.info('Merging table: ' + table_name) result_table = result_table_for(table_name) query_result = query(q, destination_table_id=result_table, write_disposition='WRITE_TRUNCATE') query_job_id = query_result['jobReference']['jobId'] jobs_to_wait_on.append(query_job_id) incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on) if len(incomplete_jobs) == 0: tables_created = [] for job_id in jobs_to_wait_on: job_details = bq_utils.get_job_details(job_id) status = job_details['status'] table = job_details['configuration']['query']['destinationTable'][ 'tableId'] if 'errors' in status: logging.error('Job ID %s errors: %s' % (job_id, status['errors'])) else: tables_created.append(table) return tables_created else: message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs logging.error(message) raise RuntimeError(message)
def _load_datasets(self): """ Load five persons data for nyc and pitt test hpo and rdr data for the excluded_hpo # expected_tables is for testing output # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...] """ expected_tables: dict = {} running_jobs: list = [] for cdm_table in resources.CDM_TABLES: output_table: str = ehr_union.output_table_for(cdm_table) expected_tables[output_table] = [] for hpo_id in self.hpo_ids: # upload csv into hpo bucket cdm_filename: str = f'{cdm_table}.csv' if hpo_id == NYC_HPO_ID: cdm_filepath: str = os.path.join( test_util.FIVE_PERSONS_PATH, cdm_filename) elif hpo_id == PITT_HPO_ID: cdm_filepath: str = os.path.join( test_util.PITT_FIVE_PERSONS_PATH, cdm_filename) elif hpo_id == EXCLUDED_HPO_ID: if cdm_table in [ 'observation', 'person', 'visit_occurrence' ]: cdm_filepath: str = os.path.join( test_util.RDR_PATH, cdm_filename) bucket: str = gcs_utils.get_hpo_bucket(hpo_id) gcs_bucket = self.storage_client.get_bucket(bucket) if os.path.exists(cdm_filepath): csv_rows = resources.csv_to_list(cdm_filepath) cdm_blob = gcs_bucket.blob(cdm_filename) cdm_blob.upload_from_filename(cdm_filepath) else: # results in empty table cdm_blob = gcs_bucket.blob(cdm_filename) cdm_blob.upload_from_string('dummy\n') csv_rows: list = [] # load table from csv result = bq_utils.load_cdm_csv(hpo_id, cdm_table) running_jobs.append(result['jobReference']['jobId']) if hpo_id != EXCLUDED_HPO_ID: expected_tables[output_table] += list(csv_rows) # ensure person to observation output is as expected output_table_person: str = ehr_union.output_table_for(common.PERSON) output_table_observation: str = ehr_union.output_table_for( common.OBSERVATION) expected_tables[output_table_observation] += 4 * expected_tables[ output_table_person] incomplete_jobs: list = bq_utils.wait_on_jobs(running_jobs) if len(incomplete_jobs) > 0: message: str = "Job id(s) %s failed to complete" % incomplete_jobs raise RuntimeError(message) self.expected_tables = expected_tables
def perform_validation_on_file(file_name, found_file_names, hpo_id, folder_prefix, bucket): """ Attempts to load a csv file into BigQuery :param file_name: name of the file to validate :param found_file_names: files found in the submission folder :param hpo_id: identifies the hpo site :param folder_prefix: directory containing the submission :param bucket: bucket containing the submission :return: tuple (results, errors) where results is list of tuples (file_name, found, parsed, loaded) errors is list of tuples (file_name, message) """ errors = [] results = [] logging.info(f"Validating file '{file_name}'") found = parsed = loaded = 0 table_name = file_name.split('.')[0] if file_name in found_file_names: found = 1 load_results = bq_utils.load_from_csv(hpo_id, table_name, folder_prefix) load_job_id = load_results['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([load_job_id]) if not incomplete_jobs: job_resource = bq_utils.get_job_details(job_id=load_job_id) job_status = job_resource['status'] if 'errorResult' in job_status: # These are issues (which we report back) as opposed to internal errors issues = [item['message'] for item in job_status['errors']] errors.append((file_name, ' || '.join(issues))) logging.info( f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}" ) for issue in issues: logging.info(issue) else: # Processed ok parsed = loaded = 1 else: # Incomplete jobs are internal unrecoverable errors. # Aborting the process allows for this submission to be validated when system recovers. message = ( f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because " f"job id '{load_job_id}' did not complete.\n") message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'." logging.error(message) raise InternalValidationError(message) if file_name in common.SUBMISSION_FILES: results.append((file_name, found, parsed, loaded)) return results, errors
def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset): mock_list_datasets.return_value = [{'id': self.project_id+':'+self.bq_dataset_id}] mock_is_ehr_dataset.return_value = True job_ids = [] row_count_queries = {} # load the cdm files into dataset for cdm_file in test_util.NYC_FIVE_PERSONS_FILES: cdm_file_name = os.path.basename(cdm_file) cdm_table = cdm_file_name.split('.')[0] hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table) # store query for checking number of rows to delete row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(dataset_id=self.bq_dataset_id, table_id=hpo_table, pids=retract_data_bq.int_list_to_bq( self.person_ids)) retract_data_bq.logger.debug('Preparing to load table %s.%s' % (self.bq_dataset_id, hpo_table)) with open(cdm_file, 'rb') as f: gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id), cdm_file_name, f) result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table, dataset_id=self.bq_dataset_id) retract_data_bq.logger.debug('Loading table %s.%s' % (self.bq_dataset_id, hpo_table)) job_id = result['jobReference']['jobId'] job_ids.append(job_id) incomplete_jobs = bq_utils.wait_on_jobs(job_ids) self.assertEqual(len(incomplete_jobs), 0, 'NYC five person load job did not complete') retract_data_bq.logger.debug('All tables loaded successfully') # use query results to count number of expected row deletions expected_row_count = {} for table in row_count_queries: result = bq_utils.query(row_count_queries[table]) expected_row_count[table] = retract_data_bq.to_int(result['totalRows']) # separate check to find number of actual deleted rows q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id) q_result = bq_utils.query(q) result = bq_utils.response2rows(q_result) row_count_before_retraction = {} for row in result: row_count_before_retraction[row['table_id']] = row['row_count'] deid_flag = False # perform retraction retract_data_bq.run_retraction(self.test_project_id, self.person_ids, self.hpo_id, deid_flag) # find actual deleted rows q_result = bq_utils.query(q) result = bq_utils.response2rows(q_result) row_count_after_retraction = {} for row in result: row_count_after_retraction[row['table_id']] = row['row_count'] for table in expected_row_count: self.assertEqual(expected_row_count[table], row_count_before_retraction[table] - row_count_after_retraction[table])
def load_measurement_concept_sets_descendants_table(project_id, dataset_id): """ Loads the measurement_concept_sets_descendants table using LOINC group and LOINC hierarchy into project_id.ehr_ops :param project_id: Project where the dataset resides :param dataset_id: Dataset where the required lab table needs to be created :return: None """ check_and_copy_tables(project_id, dataset_id) identify_labs_query = IDENTIFY_LABS_QUERY.format( project_id=project_id, ehr_ops_dataset_id=dataset_id, vocab_dataset_id=dataset_id, measurement_concept_sets=MEASUREMENT_CONCEPT_SETS_TABLE) try: LOGGER.info(f"Running query {identify_labs_query}") results = bq_utils.query( identify_labs_query, use_legacy_sql=False, destination_table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE, retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT, write_disposition=bq_consts.WRITE_TRUNCATE, destination_dataset_id=dataset_id, batch=None) except (oauth2client.client.HttpAccessTokenRefreshError, googleapiclient.errors.HttpError): LOGGER.exception(f"FAILED: Clean rule not executed:\n{app_identity}") query_job_id = results['jobReference']['jobId'] bq_utils.wait_on_jobs([query_job_id]) updated_rows = results.get("totalRows") if updated_rows is not None: LOGGER.info( f"Query returned {updated_rows} rows for {dataset_id}.{MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE}" )
def load_test_data(self, hpo_id=None): """ Load to bq test achilles heel results data from csv file :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used :return: contents of the file as list of objects """ schema_path = os.path.join(resources.fields_path, common.ACHILLES_HEEL_RESULTS + '.json') table_id = common.ACHILLES_HEEL_RESULTS if hpo_id is not None: table_id = bq_utils.get_table_id(hpo_id, common.ACHILLES_HEEL_RESULTS) test_file_name = table_id + '.csv' test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name) test_util.write_cloud_file(self.bucket, test_file_path) gcs_path = 'gs://' + self.bucket + '/' + test_file_name load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id, self.dataset_id, table_id) job_id = load_results['jobReference']['jobId'] bq_utils.wait_on_jobs([job_id]) return resources._csv_to_list(test_file_path)
def populate_achilles(hpo_id=FAKE_HPO_ID, include_heel=True): from validation import achilles, achilles_heel import app_identity app_id = app_identity.get_application_id() test_resources_bucket = RESOURCES_BUCKET_FMT.format(project_id=app_id) table_names = [ achilles.ACHILLES_ANALYSIS, achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST ] if include_heel: table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS) running_jobs = [] for table_name in table_names: gcs_path = f'gs://{test_resources_bucket}/{table_name}.csv' dataset_id = bq_utils.get_dataset_id() table_id = bq_utils.get_table_id(hpo_id, table_name) load_results = bq_utils.load_csv(table_name, gcs_path, app_id, dataset_id, table_id) running_jobs.append(load_results['jobReference']['jobId']) bq_utils.wait_on_jobs(running_jobs)
def test_query_result(self): with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp: gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp) result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON) load_job_id = result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([load_job_id]) self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(PERSON)) table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON) q = 'SELECT person_id FROM %s' % table_id result = bq_utils.query(q) self.assertEqual(5, int(result['totalRows']))
def test_load_cdm_csv(self): with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp: gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp) result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON) self.assertEqual(result['status']['state'], 'RUNNING') load_job_id = result['jobReference']['jobId'] table_id = result['configuration']['load']['destinationTable']['tableId'] incomplete_jobs = bq_utils.wait_on_jobs([load_job_id]) self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(table_id)) table_info = bq_utils.get_table_info(table_id) num_rows = table_info.get('numRows') self.assertEqual(num_rows, '5')
def test_merge_with_good_data(self): running_jobs = [] with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp: gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'), 'person.csv', fp) result = bq_utils.load_cdm_csv('nyc', 'person') running_jobs.append(result['jobReference']['jobId']) with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp: gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'), 'person.csv', fp) result = bq_utils.load_cdm_csv('pitt', 'person') running_jobs.append(result['jobReference']['jobId']) nyc_person_ids = [ int(row['person_id']) for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV) ] pitt_person_ids = [ int(row['person_id']) for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV) ] expected_result = nyc_person_ids + pitt_person_ids expected_result.sort() incomplete_jobs = bq_utils.wait_on_jobs(running_jobs) self.assertEqual( len(incomplete_jobs), 0, 'loading tables {},{} timed out'.format('nyc_person', 'pitt_person')) dataset_id = bq_utils.get_dataset_id() table_ids = ['nyc_person', 'pitt_person'] merged_table_id = 'merged_nyc_pitt' success_flag, error = bq_utils.merge_tables(dataset_id, table_ids, dataset_id, merged_table_id) self.assertTrue(success_flag) self.assertEqual(error, "") query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format( dataset_id=dataset_id, table_id=merged_table_id) merged_query_job_result = bq_utils.query(query_string) self.assertIsNone(merged_query_job_result.get('errors', None)) actual_result = [ int(row['f'][0]['v']) for row in merged_query_job_result['rows'] ] actual_result.sort() self.assertListEqual(expected_result, actual_result)
def test_query_result(self): sc_bucket = self.client.get_bucket(self.hpo_bucket) bucket_blob = sc_bucket.blob('person.csv') with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp: bucket_blob.upload_from_file(fp) result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON) load_job_id = result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([load_job_id]) self.assertEqual(len(incomplete_jobs), 0, 'loading table {} timed out'.format(common.PERSON)) table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON) q = 'SELECT person_id FROM %s' % table_id result = bq_utils.query(q) self.assertEqual(5, int(result['totalRows']))
def query(q, dst_table_id, write_disposition='WRITE_APPEND'): """ Run query and block until job is done :param q: SQL statement :param dst_table_id: if set, output is saved in a table with the specified id :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema) """ dst_dataset_id = bq_utils.get_ehr_rdr_dataset_id() query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition, destination_dataset_id=dst_dataset_id) query_job_id = query_job_result['jobReference']['jobId'] incomplete_jobs = bq_utils.wait_on_jobs([query_job_id]) if len(incomplete_jobs) > 0: raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
def load_dataset_from_files(dataset_id, path, mappings=False): bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID) test_util.empty_bucket(bucket) job_ids = [] for table in resources.CDM_TABLES: job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, table)) if mappings and table in DOMAIN_TABLES: mapping_table = '_mapping_{table}'.format(table=table) job_ids.append(CombineEhrRdrTest._upload_file_to_bucket(bucket, dataset_id, path, mapping_table)) incomplete_jobs = bq_utils.wait_on_jobs(job_ids) if len(incomplete_jobs) > 0: message = "Job id(s) %s failed to complete" % incomplete_jobs raise RuntimeError(message) test_util.empty_bucket(bucket)