def test_execute_queries(self):
    project_id = bq_utils.app_identity.get_application_id()
    dataset_id = bq_utils.get_combined_dataset_id()
    test_util.delete_all_tables(dataset_id)
    create_tables = (
        ['person'] + common.CLINICAL_DATA_TABLES +
        ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
    # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
    for tbl in ['concept', 'concept_ancestor']:
        if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
            create_tables.append(tbl)
    for tbl in create_tables:
        bq_utils.create_standard_table(tbl,
                                       tbl,
                                       dataset_id=dataset_id,
                                       force_all_nullable=True)

    for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
        resp = bq_utils.query(
            tmpl.render(project_id=project_id,
                        dataset_id=dataset_id,
                        rdr_basics_concept_id=123,
                        rdr_consent_concept_id=345,
                        ehr_obs_concept_id=567,
                        rdr_basics_module_concept_id=
                        drop_participants_without_ppi_or_ehr.
                        BASICS_MODULE_CONCEPT_ID))
        self.assertTrue(resp["jobComplete"])

    queries = drop_participants_without_ppi_or_ehr.get_queries(
        project_id, dataset_id)
    clean_cdr_engine.clean_dataset(project_id, queries)

    def table_to_person_ids(t):
        rows = bq_utils.response2rows(
            bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                project_id, dataset_id, t)))
        return set([r["person_id"] for r in rows])

    # We expect participants 1, 5 to have been removed from all tables.
    self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
    self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
    self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

    test_util.delete_all_tables(dataset_id)
def validate_submission(hpo_id, bucket, bucket_items, folder_prefix):
    logging.info('Validating %s submission in gs://%s/%s', hpo_id, bucket,
                 folder_prefix)
    # separate cdm from the unknown (unexpected) files
    found_cdm_files = []
    unknown_files = []
    found_pii_files = []
    folder_items = [
        item['name'][len(folder_prefix):]
        for item in bucket_items
        if item['name'].startswith(folder_prefix)
    ]
    for item in folder_items:
        if _is_cdm_file(item):
            found_cdm_files.append(item)
        elif _is_pii_file(item):
            found_pii_files.append(item)
        else:
            if not (_is_known_file(item) or _is_string_excluded_file(item)):
                unknown_files.append(item)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name, table_id, drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [
        (unknown_file, common.UNKNOWN_FILE) for unknown_file in unknown_files
    ]

    return dict(results=results, errors=errors, warnings=warnings)
def validate_submission(hpo_id, bucket, folder_items, folder_prefix):
    """
    Load submission in BigQuery and summarize outcome

    :param hpo_id:
    :param bucket:
    :param folder_items:
    :param folder_prefix:
    :return: a dict with keys results, errors, warnings
        results is a list of tuples (file_name, found, parsed, loaded)
        errors and warnings are both lists of tuples (file_name, message)
    """
    logging.info(
        f"Validating {hpo_id} submission in gs://{bucket}/{folder_prefix}")
    # separate cdm from the unknown (unexpected) files
    found_cdm_files, found_pii_files, unknown_files = categorize_folder_items(
        folder_items)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name, table_id, drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [
        (unknown_file, common.UNKNOWN_FILE) for unknown_file in unknown_files
    ]

    return dict(results=results, errors=errors, warnings=warnings)
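# categorize_folder_items() is called above but its definition is not part of
# this excerpt. A minimal sketch of what such a helper could look like, based
# on the inline categorization logic in the earlier version of
# validate_submission (it reuses the same _is_cdm_file / _is_pii_file /
# _is_known_file / _is_string_excluded_file predicates); this illustrates the
# expected return shape only and is not the project's actual implementation.
def categorize_folder_items(folder_items):
    """Split folder items into (found_cdm_files, found_pii_files, unknown_files)."""
    found_cdm_files = []
    found_pii_files = []
    unknown_files = []
    for item in folder_items:
        if _is_cdm_file(item):
            found_cdm_files.append(item)
        elif _is_pii_file(item):
            found_pii_files.append(item)
        elif not (_is_known_file(item) or _is_string_excluded_file(item)):
            unknown_files.append(item)
    return found_cdm_files, found_pii_files, unknown_files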
def get_domain_mapping_queries(project_id, dataset_id):
    """
    This function generates a list of query dicts for creating id mappings
    in _logging_domain_alignment. The list will get consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in _logging_domain_alignment
    """
    # Create _logging_domain_alignment
    bq_utils.create_standard_table(DOMAIN_ALIGNMENT_TABLE_NAME,
                                   DOMAIN_ALIGNMENT_TABLE_NAME,
                                   drop_existing=True,
                                   dataset_id=dataset_id)

    queries = []
    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = dict()
        query[cdr_consts.QUERY] = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        queries.append(query)

    # Create the query for creating field_mappings for the records moving between the same domain
    query = dict()
    query[cdr_consts.QUERY] = parse_domain_mapping_query_for_same_domains(
        project_id, dataset_id)
    query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
    query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
    query[cdr_consts.DESTINATION_DATASET] = dataset_id
    queries.append(query)

    # Create the query for the records that are in the wrong domain but will not be moved
    query = dict()
    query[cdr_consts.QUERY] = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
    query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
    query[cdr_consts.DESTINATION_DATASET] = dataset_id
    queries.append(query)

    return queries
def test_create_standard_table(self):
    standard_tables = list(resources.CDM_TABLES) + ACHILLES_TABLES
    for standard_table in standard_tables:
        table_id = f'prefix_for_test_{standard_table}'
        result = bq_utils.create_standard_table(standard_table, table_id)
        self.assertTrue('kind' in result)
        self.assertEqual(result['kind'], 'bigquery#table')
        # sanity check
        self.assertTrue(bq_utils.table_exists(table_id))
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                      'dummy\n')
        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def create_empty_cdm_tables(snapshot_dataset_id, dataset_id):
    """
    Create empty CDM tables (with standard schemas) in the snapshot dataset

    :param snapshot_dataset_id: dataset in which to create the empty tables
    :param dataset_id: source dataset, checked for the at_birth column on person
    :return:
    """
    for table in resources.CDM_TABLES:
        if table == PERSON and has_at_birth_column(dataset_id):
            table_id = table
            table_name = 'post_deid_person'
        else:
            table_id = table
            table_name = table
        create_standard_table(table_name,
                              table_id,
                              drop_existing=True,
                              dataset_id=snapshot_dataset_id)

    cdm.create_vocabulary_tables(snapshot_dataset_id)
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_filename: str = f'{cdm_table}.csv'
        cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_filename)

        bucket = self.storage_client.get_bucket(self.hpo_bucket)
        cdm_blob = bucket.blob(cdm_filename)
        if os.path.exists(cdm_filepath):
            cdm_blob.upload_from_filename(cdm_filepath)
        else:
            cdm_blob.upload_from_string('dummy\n')

        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def _create_drug_class_table(bigquery_dataset_id):
    table_name = 'drug_class'
    fields = [{
        "type": "integer",
        "name": "concept_id",
        "mode": "required"
    }, {
        "type": "string",
        "name": "concept_name",
        "mode": "required"
    }, {
        "type": "string",
        "name": "drug_class_name",
        "mode": "required"
    }]
    bq_utils.create_table(table_id=table_name,
                          fields=fields,
                          drop_existing=True,
                          dataset_id=bigquery_dataset_id)

    bq_utils.query(q=main_consts.DRUG_CLASS_QUERY.format(
        dataset_id=bigquery_dataset_id),
                   use_legacy_sql=False,
                   destination_table_id='drug_class',
                   retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                   write_disposition='WRITE_TRUNCATE',
                   destination_dataset_id=bigquery_dataset_id)

    # ensure concept ancestor table exists
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)
        q = """INSERT INTO {dataset}.concept_ancestor
               SELECT * FROM {vocab}.concept_ancestor""".format(
            dataset=bigquery_dataset_id, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
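# For readers unfamiliar with the dict-based field list passed to
# bq_utils.create_table above, this is a sketch of how such a list maps onto
# schema objects in the google-cloud-bigquery client library. It does not use
# the project's helpers, and the project/dataset/table identifiers are
# placeholders supplied by the caller.
from google.cloud import bigquery


def _fields_to_schema(fields):
    """Convert [{'name': ..., 'type': ..., 'mode': ...}, ...] to SchemaField objects."""
    return [
        bigquery.SchemaField(field['name'],
                             field['type'].upper(),
                             mode=field['mode'].upper()) for field in fields
    ]


def _create_table_example(project_id, dataset_id, table_name, fields):
    # Create (or reuse) a table with the given schema in the given dataset
    client = bigquery.Client(project=project_id)
    table = bigquery.Table(f'{project_id}.{dataset_id}.{table_name}',
                           schema=_fields_to_schema(fields))
    return client.create_table(table, exists_ok=True)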
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in common.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(
            dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in tables_to_map():
        logging.info(
            'Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id)

    # Load all tables with union of submitted tables
    for table_name in common.CDM_TABLES:
        logging.info(
            'Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)
def submit(self, sql, create, dml=None):
    """
    Submit the sql query to create a de-identified table.

    :param sql: The sql to send.
    :param create: a flag to identify if this query should create a new table
        or append to an existing table.
    :param dml: boolean flag identifying if a statement is a dml statement
    """
    dml = False if dml is None else dml
    table_name = self.get_tablename()
    client = bq.Client.from_service_account_json(self.private_key)
    #
    # Let's make sure the out dataset exists
    datasets = list(client.list_datasets())
    found = np.sum(
        [1 for dataset in datasets if dataset.dataset_id == self.odataset])
    if not found:
        dataset = bq.Dataset(client.dataset(self.odataset))
        client.create_dataset(dataset)

    # create the output table
    if create:
        LOGGER.info(f"creating new table:\t{self.tablename}")
        bq_utils.create_standard_table(self.tablename,
                                       self.tablename,
                                       drop_existing=True,
                                       dataset_id=self.odataset)
        write_disposition = bq_consts.WRITE_EMPTY
    else:
        write_disposition = bq_consts.WRITE_APPEND
        LOGGER.info(f"appending results to table:\t{self.tablename}")

    job = bq.QueryJobConfig()
    job.priority = self.priority
    job.dry_run = True

    dml_job = None
    if not dml:
        job.destination = client.dataset(self.odataset).table(self.tablename)
        job.use_query_cache = True
        job.allow_large_results = True
        job.write_disposition = write_disposition
        if self.partition:
            job._properties['timePartitioning'] = {'type': 'DAY'}
            job._properties['clustering'] = {'field': 'person_id'}
    else:
        # create a copy of the job config to use if the dry-run passes
        dml_job = copy(job)

    LOGGER.info(f"submitting a dry-run for:\t{self.get_tablename()}\t\t"
                f"priority:\t{self.priority}\t\tpartition:\t{self.partition}")

    logpath = os.path.join(self.logpath, self.idataset)
    try:
        os.makedirs(logpath)
    except OSError:
        # log path already exists and we don't care
        pass

    try:
        response = client.query(sql, location='US', job_config=job)
    except Exception:
        LOGGER.exception(f"dry run query failed for:\t{self.get_tablename()}\n"
                         f"\t\tSQL:\t{sql}\n"
                         f"\t\tjob config:\t{job}")
    else:
        if response.state == 'DONE':
            if dml_job:
                job = dml_job
            job.dry_run = False
            LOGGER.info('dry-run passed. submitting query for execution.')
            response = client.query(sql, location='US', job_config=job)
            LOGGER.info(
                f"submitted a bigquery job for table:\t{table_name}\t\t"
                f"status:\t'pending'\t\tvalue:\t{response.job_id}")
            self.wait(client, response.job_id)
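# The dry-run-then-execute pattern used by submit() above, reduced to a
# self-contained sketch against the plain google-cloud-bigquery client. None of
# the class's bookkeeping (log paths, partitioning, DML handling, write
# dispositions) is reproduced; the destination argument is a placeholder
# "project.dataset.table" string supplied by the caller.
from google.cloud import bigquery


def dry_run_then_execute(client, sql, destination):
    # Validate the query first without running it or incurring cost
    dry_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    dry_job = client.query(sql, job_config=dry_config)
    print(f'dry run ok, would process {dry_job.total_bytes_processed} bytes')

    # Then run it for real, writing the results to the destination table
    run_config = bigquery.QueryJobConfig(destination=destination,
                                         write_disposition='WRITE_TRUNCATE')
    job = client.query(sql, job_config=run_config)
    return job.result()  # block until the job completes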
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises
    BucketDoesNotExistError:
        Raised when a configured bucket does not exist
    InternalValidationError:
        Raised when an internal error is encountered during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        found_pii_files = []
        folder_items = [
            item['name'].split('/')[1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            elif _is_pii_file(item):
                found_pii_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for file_name in common.CDM_FILES + common.PII_FILES:
            table_name = file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, table_name)
            bq_utils.create_standard_table(table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            file_results, file_errors = perform_validation_on_file(
                cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        for pii_file_name in common.PII_FILES:
            file_results, file_errors = perform_validation_on_file(
                pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        # (filename, message) for each unknown file
        warnings = [
            (unknown_file, UNKNOWN_FILE) for unknown_file in unknown_files
        ]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_errors_warnings_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors,
                                     warnings)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                         (bucket, folder_prefix))
            _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises
    BucketDoesNotExistError:
        Raised when a configured bucket does not exist
    InternalValidationError:
        Raised when an internal error is encountered during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]
            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'
                            .format(bucket=bucket,
                                    folder_prefix=folder_prefix,
                                    cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name,
                                             load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (
                        bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)

            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [
            (unknown_file, UNKNOWN_FILE) for unknown_file in unknown_files
        ]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                         (bucket, folder_prefix))
            _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
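# The (cdm_file_name, found, parsed, loaded) tuples collected above are what
# _save_result_in_gcs writes to RESULT_CSV. A minimal, self-contained sketch of
# rendering such rows as CSV text; the header names are hypothetical, not
# necessarily the ones the real helper uses.
import csv
import io


def results_to_csv(results):
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(['cdm_file_name', 'found', 'parsed', 'loaded'])
    writer.writerows(results)
    return buf.getvalue()


# e.g. results_to_csv([('person.csv', 1, 1, 1), ('visit_occurrence.csv', 0, 0, 0)])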
def submit(self, sql):
    """
    Dry-run the given sql and, if it passes, submit it to create the
    de-identified output table
    """
    table_name = self.get_tablename()
    client = bq.Client.from_service_account_json(self.private_key)
    #
    # Let's make sure the out dataset exists
    datasets = list(client.list_datasets())
    found = np.sum(
        [1 for dataset in datasets if dataset.dataset_id == self.odataset])
    if not found:
        dataset = bq.Dataset(client.dataset(self.odataset))
        client.create_dataset(dataset)

    # create the output table
    bq_utils.create_standard_table(self.tablename,
                                   self.tablename,
                                   drop_existing=True,
                                   dataset_id=self.odataset)

    job = bq.QueryJobConfig()
    job.destination = client.dataset(self.odataset).table(self.tablename)
    job.use_query_cache = True
    job.allow_large_results = True
    if self.partition:
        job._properties['timePartitioning'] = {'type': 'DAY'}
        job._properties['clustering'] = {'field': 'person_id'}

    job.priority = self.priority
    job.dry_run = True
    self.log(module='submit-job',
             subject=self.get_tablename(),
             action='dry-run',
             value={
                 'priority': self.priority,
                 'partition': self.partition
             })

    logpath = os.path.join(self.logpath, self.idataset)
    try:
        os.makedirs(logpath)
    except OSError:
        # log path already exists and we don't care
        pass

    r = client.query(sql, location='US', job_config=job)
    if r.errors is None and r.state == 'DONE':
        job.dry_run = False
        r = client.query(sql, location='US', job_config=job)
        self.log(module='submit',
                 subject=self.get_tablename(),
                 action='submit-job',
                 table=table_name,
                 status='pending',
                 value=r.job_id,
                 object='bigquery')
        self.wait(client, r.job_id)
        # self.finalize(client)
        #
        # At this point we must try to partition the table
    else:
        self.log(module='submit',
                 subject=self.get_tablename(),
                 action='submit-job',
                 table=table_name,
                 status='error',
                 value=r.errors)
        print(r.errors)