def test_merge_with_unmatched_schema(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'measurement.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'measurement')
    running_jobs.append(result['jobReference']['jobId'])

    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_measurement',
                                                'pitt_person'))

    table_names = ['nyc_measurement', 'pitt_person']
    success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                           table_names,
                                           bq_utils.get_dataset_id(),
                                           'merged_nyc_pitt')
    self.assertFalse(success)
def test_query_result(self):
    sc_bucket = self.client.get_bucket(self.hpo_bucket)
    bucket_blob = sc_bucket.blob('person.csv')
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)

    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON)
    load_job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(common.PERSON))

    table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
    q = 'SELECT person_id FROM %s' % table_id
    result = bq_utils.query(q)
    self.assertEqual(5, int(result['totalRows']))
def test_query_result(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)

    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    load_job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(PERSON))

    table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
    q = 'SELECT person_id FROM %s' % table_id
    result = bq_utils.query(q)
    self.assertEqual(5, int(result['totalRows']))
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_filename: str = f'{cdm_table}.csv'
        cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_filename)

        bucket = self.storage_client.get_bucket(self.hpo_bucket)
        cdm_blob = bucket.blob(cdm_filename)
        if os.path.exists(cdm_filepath):
            cdm_blob.upload_from_filename(cdm_filepath)
        else:
            cdm_blob.upload_from_string('dummy\n')

        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def _load_datasets(self):
    load_jobs = []
    for cdm_table in common.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
            test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                      'dummy\n')
            test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                      'dummy\n')

        chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
        pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)

        chs_load_job_id = chs_load_results['jobReference']['jobId']
        pitt_load_job_id = pitt_load_results['jobReference']['jobId']
        load_jobs.append(chs_load_job_id)
        load_jobs.append(pitt_load_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery jobs %s failed to complete' %
                           incomplete_jobs)
def test_load_cdm_csv(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)

    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(result['status']['state'], 'RUNNING')

    load_job_id = result['jobReference']['jobId']
    table_id = result['configuration']['load']['destinationTable']['tableId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_id))

    table_info = bq_utils.get_table_info(table_id)
    num_rows = table_info.get('numRows')
    self.assertEqual(num_rows, '5')
def test_load_cdm_csv(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)

    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(result['status']['state'], 'RUNNING')

    load_job_id = result['jobReference']['jobId']
    table_id = result['configuration']['load']['destinationTable']['tableId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_id))

    query_response = bq_utils.query('SELECT 1 FROM %(table_id)s' % locals())
    self.assertEqual(query_response['totalRows'], '5')
def _load_datasets(self):
    """
    Load the five persons data for each test HPO.

    expected_tables is used to verify output: it maps an output table name to
    the list of records expected in it, e.g.
    "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected_tables = dict()
    running_jobs = []
    for cdm_table in resources.CDM_TABLES:
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            if hpo_id == NYC_HPO_ID:
                cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            else:
                cdm_file_name = os.path.join(test_util.PITT_FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources.csv_to_list(cdm_file_name)
            else:
                # results in empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                csv_rows = []
            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)

    # ensure person to observation output is as expected
    output_table_person = ehr_union.output_table_for(
        combine_ehr_rdr.PERSON_TABLE)
    output_table_observation = ehr_union.output_table_for(
        combine_ehr_rdr.OBSERVATION_TABLE)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
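# Illustrative sketch (not part of the test above): the shape `expected_tables`
# takes after `_load_datasets` runs. The table name matches the docstring's
# example; the row values below are hypothetical, since the real rows come
# from the five-persons CSV files.
_example_expected_tables = {
    'unioned_ehr_visit_occurrence': [
        {'visit_occurrence_id': '1', 'person_id': '1'},  # row from the NYC csv
        {'visit_occurrence_id': '2', 'person_id': '2'},  # row from the Pitt csv
    ],
    # tables whose source csv is missing map to an empty list (empty table)
    'unioned_ehr_death': [],
}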
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = ('SELECT observation_id FROM {dataset_id}.{table_id} '
         'ORDER BY observation_id').format(dataset_id=dataset_id,
                                           table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]

    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')

    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))

    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)
def test_load_cdm_csv_error_on_bad_table_name(self):
    with self.assertRaises(ValueError):
        bq_utils.load_cdm_csv(FAKE_HPO_ID, 'not_a_cdm_table')
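# Illustrative guard (not part of bq_utils): the test above relies on
# load_cdm_csv raising ValueError for a table name outside the CDM. A caller
# could check membership in resources.CDM_TABLES before loading; this helper
# is a sketch, assuming resources is imported as in the other tests here.
def _load_cdm_csv_checked(hpo_id, table_name):
    if table_name not in resources.CDM_TABLES:
        raise ValueError(f'{table_name} is not a CDM table')
    return bq_utils.load_cdm_csv(hpo_id, table_name)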
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset, mock_is_unioned_dataset,
        mock_is_combined_dataset, mock_is_deid_dataset):
    mock_list_datasets.return_value = [{
        'id': self.project_id + ':' + self.bq_dataset_id
    }]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq_utils.create_table(self.pid_table_id,
                          retract_data_bq.PID_TABLE_FIELDS,
                          drop_existing=True,
                          dataset_id=self.bq_dataset_id)
    bq_formatted_insert_values = ', '.join([
        '(%s, %s)' % (person_id, research_id)
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    bq_utils.query(q)

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        retract_data_bq.logger.info('Preparing to load table %s.%s' %
                                    (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        retract_data_bq.logger.info('Loading table %s.%s' %
                                    (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    retract_data_bq.logger.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = int(result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # perform retraction
    retract_data_bq.run_bq_retraction(self.test_project_id,
                                      self.bq_dataset_id,
                                      self.test_project_id,
                                      self.pid_table_id, self.hpo_id,
                                      self.dataset_ids)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
def run_validation(hpo_id, force_run=False):
    """
    Runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises BucketDoesNotExistError: raised when a configured bucket does not exist
    :raises InternalValidationError: raised when an internal error is encountered
        during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]
            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'
                            .format(bucket=bucket,
                                    folder_prefix=folder_prefix,
                                    cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name,
                                             load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (
                        bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)

            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [
            (unknown_file, UNKNOWN_FILE) for unknown_file in unknown_files
        ]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                         (bucket, folder_prefix))
            _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
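# Illustrative usage sketch (not part of the source module): how a caller might
# invoke run_validation for a single site and handle the documented failure
# mode. The hpo_id value and the decision to re-raise are assumptions.
def _validate_site_once(hpo_id='nyc'):
    try:
        run_validation(hpo_id, force_run=True)
    except InternalValidationError:
        # a load job that never completed is unrecoverable here; log it and
        # let the submission be re-validated once the system recovers
        logging.exception('Validation aborted for hpo_id %s', hpo_id)
        raise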
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_retraction_info):
    d = {
        'project_id': [
            self.project_id, self.project_id, self.project_id,
            self.project_id, self.project_id, self.project_id
        ],
        'dataset_id': [
            self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id,
            self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id
        ],
        'table': [
            'fake_condition_occurrence', 'fake_drug_exposure',
            'fake_measurement', 'fake_observation',
            'fake_procedure_occurrence', 'fake_visit_occurrence'
        ],
        'date_column': [
            None, None, 'measurement_date', 'observation_date',
            'procedure_date', None
        ],
        'start_date_column': [
            'condition_start_date', 'drug_exposure_start_date', None, None,
            None, 'visit_start_date'
        ],
        'end_date_column': [
            'condition_end_date', 'drug_exposure_end_date', None, None, None,
            'visit_end_date'
        ]
    }
    retraction_info = pd.DataFrame(data=d)
    mock_retraction_info.return_value = retraction_info

    # Create and load person_ids and deactivated_date to pid table
    bq.create_tables(self.client,
                     self.project_id,
                     self.pid_table_id_list,
                     exists_ok=False,
                     fields=retract_deactivated_pids.PID_TABLE_FIELDS)
    bq_formatted_insert_values = ', '.join([
        '(%s, "%s")' % (person_id, deactivated_date)
        for (person_id, deactivated_date) in self.deactivated_ehr_participants
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    self.client.query(q)

    job_ids = []
    dropped_row_count_queries = []
    kept_row_count_queries = []
    hpo_table_list = []

    # Load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # Do not process if person table
        if hpo_table == 'fake_person':
            continue
        hpo_table_list.append(hpo_table)
        logging.info(
            f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        logging.info(f'Loading table {self.bq_dataset_id}.{hpo_table}')
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    logging.info('All tables loaded successfully')

    # Store query for checking number of rows to delete
    for ehr in self.deactivated_ehr_participants:
        pid = ehr[0]
        for row in retraction_info.itertuples(index=False):
            if row.date_column is None:
                dropped_query = EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                kept_query = EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
            else:
                dropped_query = EXPECTED_DROPPED_ROWS_QUERY.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    date_column=row.date_column)
                kept_query = EXPECTED_KEPT_ROWS_QUERY.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    date_column=row.date_column)
            dropped_row_count_queries.append({
                clean_consts.QUERY: dropped_query,
                clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            kept_row_count_queries.append({
                clean_consts.QUERY: kept_query,
                clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })

    # Use query results to count number of expected dropped row deletions
    expected_dropped_row_count = {}
    for query_dict in dropped_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_dropped_row_count:
            expected_dropped_row_count[
                query_dict['destination_table_id']] += result.total_rows
        else:
            expected_dropped_row_count[
                query_dict['destination_table_id']] = result.total_rows

    # Separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = self.client.query(q)
    row_count_before_retraction = {}
    for row in q_result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # Use query results to count number of expected kept rows
    expected_kept_row_count = {}
    for query_dict in kept_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_kept_row_count:
            expected_kept_row_count[query_dict['destination_table_id']] -= (
                row_count_before_retraction[query_dict['destination_table_id']]
                - result.total_rows)
        else:
            expected_kept_row_count[query_dict['destination_table_id']] = (
                row_count_before_retraction[query_dict['destination_table_id']]
                - (row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows))

    # Perform retraction
    query_list = retract_deactivated_pids.create_queries(
        self.project_id, self.ticket_number, self.project_id,
        self.bq_dataset_id, self.pid_table_id)
    retract_deactivated_pids.run_queries(query_list, self.client)

    # Find actual deleted rows
    q_result = self.client.query(q)
    results = q_result.result()
    row_count_after_retraction = {}
    for row in results:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_dropped_row_count:
        self.assertEqual(
            expected_dropped_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
    for table in expected_kept_row_count:
        self.assertEqual(expected_kept_row_count[table],
                         row_count_after_retraction[table])