def tearDown(self):
    self._empty_bucket()
    to_delete_list = gcs_utils.list_bucket(gcs_utils.get_drc_bucket())
    for bucket_item in to_delete_list:
        gcs_utils.delete_object(gcs_utils.get_drc_bucket(),
                                bucket_item['name'])
    self.testbed.deactivate()
def get_most_recent(app_id=None, drc_bucket=None, report_for=None):
    """
    Query audit logs for paths to the most recent datasources.json files in the DRC bucket.

    Note: Results are cached in a local json file to avoid unnecessary queries.

    :param app_id: identifies the GCP project
    :param drc_bucket: identifies the DRC bucket
    :param report_for: denotes which query to use b/w achilles and results
    :return: list of dict with keys `file_path`, `upload_timestamp`
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if drc_bucket is None:
        drc_bucket = gcs_utils.get_drc_bucket()
    if report_for == common.REPORT_FOR_ACHILLES:
        if not os.path.exists(common.LATEST_REPORTS_JSON):
            query = LATEST_REPORTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_REPORTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_REPORTS_JSON, 'r') as fp:
            return json.load(fp)
    elif report_for == common.REPORT_FOR_RESULTS:
        if not os.path.exists(common.LATEST_RESULTS_JSON):
            query = LATEST_RESULTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_RESULTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_RESULTS_JSON, 'r') as fp:
            return json.load(fp)
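

# Hypothetical usage sketch, not part of the original module: fetch the cached
# list of the most recent achilles report paths. Only names already referenced
# above (get_most_recent, common.REPORT_FOR_ACHILLES) are used; note that an
# unrecognized `report_for` value falls through and the function implicitly
# returns None, hence the `or []` guard.
if __name__ == '__main__':
    latest_reports = get_most_recent(report_for=common.REPORT_FOR_ACHILLES)
    for report in latest_reports or []:
        print('%s uploaded at %s' %
              (report['file_path'], report['upload_timestamp']))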
def run_retraction_cron():
    project_id = bq_utils.app_identity.get_application_id()
    hpo_id = bq_utils.get_retraction_hpo_id()
    person_ids_file = bq_utils.get_retraction_person_ids_file_name()
    research_ids_file = bq_utils.get_retraction_research_ids_file_name()
    person_ids = retract_data_bq.extract_pids_from_file(person_ids_file)
    research_ids = retract_data_bq.extract_pids_from_file(research_ids_file)

    logging.info('Running retraction on research_ids')
    retract_data_bq.run_retraction(project_id,
                                   research_ids,
                                   hpo_id,
                                   deid_flag=True)
    logging.info('Completed retraction on research_ids')

    logging.info('Running retraction on person_ids')
    retract_data_bq.run_retraction(project_id,
                                   person_ids,
                                   hpo_id,
                                   deid_flag=False)
    logging.info('Completed retraction on person_ids')

    bucket = gcs_utils.get_drc_bucket()
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    logging.info('Running retraction from bucket folders')
    retract_data_gcs.run_retraction(person_ids,
                                    bucket,
                                    hpo_id,
                                    hpo_bucket,
                                    folder=None,
                                    force_flag=True)
    logging.info('Completed retraction from bucket folders')
    return 'retraction-complete'
def test_copy_five_persons(self, mock_check_cron):
    # upload all five_persons files
    for cdm_pathfile in test_util.FIVE_PERSONS_FILES:
        test_filename: str = os.path.basename(cdm_pathfile)

        blob_name: str = f'{self.folder_prefix}{test_filename}'
        test_blob = self.storage_bucket.blob(blob_name)
        test_blob.upload_from_filename(cdm_pathfile)

        blob_name: str = f'{self.folder_prefix}{self.folder_prefix}{test_filename}'
        test_blob = self.storage_bucket.blob(blob_name)
        test_blob.upload_from_filename(cdm_pathfile)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.COPY_HPO_FILES_URL)

        prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
        expected_bucket_items = [
            prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ]
        expected_bucket_items.extend([
            prefix + self.folder_prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ])

        list_bucket_result = gcs_utils.list_bucket(gcs_utils.get_drc_bucket())
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items),
                            set(actual_bucket_items))
def setUp(self):
    self.app_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket = gcs_utils.get_drc_bucket()
    test_util.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)
def copy_files(hpo_id):
    """
    Copies over files from an HPO bucket to the DRC bucket.

    :param hpo_id: hpo from which to copy
    :return: json string indicating the job has finished
    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = list_bucket(hpo_bucket)

    ignored_items = 0
    filtered_bucket_items = []
    for item in bucket_items:
        item_root = item['name'].split('/')[0] + '/'
        if item_root.lower() in common.IGNORE_DIRECTORIES:
            ignored_items += 1
        else:
            filtered_bucket_items.append(item)

    logging.info("Ignoring %d items in %s", ignored_items, hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'
    for item in filtered_bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
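

# Hedged usage sketch, not from the source: invoke the copy for a single HPO
# and parse the status string it returns. The hpo_id value is a placeholder,
# and `json` is assumed to be imported in the enclosing module.
def copy_files_for_site(hpo_id='fake_hpo'):
    status = json.loads(copy_files(hpo_id))
    logging.info('Copy finished for %s with status %s', hpo_id,
                 status.get('copy-status'))
    return status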
def write_results_to_drc_bucket(project, validation_dataset=None):
    """
    Write the results of participant matching to the drc bucket.

    :param project: a string representing the project name
    :param validation_dataset: the identifier for the match values
        destination dataset
    :return: None
    :raises: RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to the DRC bucket')
    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)
    hpo_sites = readers.get_hpo_site_names()

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset,
                            consts.REPORT_DIRECTORY.format(date=date_string),
                            consts.REPORT_TITLE)

    _, errors = writers.create_site_validation_report(project,
                                                      validation_dataset,
                                                      hpo_sites, bucket,
                                                      filename)

    if errors > 0:
        LOGGER.error(
            f"Encountered {errors} read errors when writing drc report")
def test_copy_five_persons(self, mock_check_cron):
    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix +
                                   self.folder_prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.COPY_HPO_FILES_URL)

        prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
        expected_bucket_items = [
            prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ]
        expected_bucket_items.extend([
            prefix + self.folder_prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ])

        list_bucket_result = gcs_utils.list_bucket(gcs_utils.get_drc_bucket())
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items),
                            set(actual_bucket_items))
def tearDown(self):
    self._empty_bucket()
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.empty_bucket(bucket_nyc)
    test_util.empty_bucket(gcs_utils.get_drc_bucket())
    test_util.delete_all_tables(self.bigquery_dataset_id)
    self.testbed.deactivate()
def setUp(self):
    super(CombineEhrRdrTest, self).setUp()
    self.APP_ID = bq_utils.app_identity.get_application_id()
    self.ehr_dataset_id = bq_utils.get_dataset_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    self.drc_bucket = gcs_utils.get_drc_bucket()
    test_util.delete_all_tables(self.combined_dataset_id)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket: str = gcs_utils.get_drc_bucket()
    self.storage_client = StorageClient(self.project_id)
    self.storage_client.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)
def _test_site_generation(self, mock_check_cron):
    self._empty_drc_bucket()
    self._empty_hpo_buckets()
    with main.app.test_request_context():
        result = main._generate_site()
        self.assertEquals(result, 'okay')

        # verify that page worked
        bucket = gcs_utils.get_drc_bucket()
        expected_files = map(lambda n: n + '.html',
                             main.PAGE_NAMES) + [common.LOG_JSON]
        file_count = 0
        for stat in gcs_utils.list_bucket(bucket):
            filename = stat['name']
            self.assertIn(filename, expected_files)
            file_count += 1
        self.assertEquals(file_count, len(expected_files))
def copy_files(hpo_id):
    """
    Copies over files from an HPO bucket to the DRC bucket.

    :param hpo_id: hpo from which to copy
    :return: json string indicating the job has finished
    """
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    drc_private_bucket = gcs_utils.get_drc_bucket()

    bucket_items = gcs_utils.list_bucket(hpo_bucket)

    prefix = hpo_id + '/' + hpo_bucket + '/'
    for item in bucket_items:
        item_name = item['name']
        gcs_utils.copy_object(source_bucket=hpo_bucket,
                              source_object_id=item_name,
                              destination_bucket=drc_private_bucket,
                              destination_object_id=prefix + item_name)

    return '{"copy-status": "done"}'
def _generate_site():
    """
    Construct html pages for each report, data_model, file_transfer and index.

    :return: returns 'okay' if successful. logs critical error otherwise.
    """
    bucket = gcs_utils.get_drc_bucket()
    for page_name in PAGE_NAMES:
        # generate the page
        html = to_html(page_name)

        # write it to the drc shared bucket
        file_name = page_name + '.html'
        fp = StringIO.StringIO(html)
        gcs_utils.upload_object(bucket, file_name, fp)

    # aggregate result logs and write to bucket
    full_result_log = get_full_result_log()
    content = json.dumps(full_result_log)
    fp = StringIO.StringIO(content)
    gcs_utils.upload_object(bucket, LOG_JSON, fp)
    return 'okay'
def tearDown(self):
    self.storage_client.empty_bucket(self.hpo_bucket)
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    self.storage_client.empty_bucket(bucket_nyc)
    self.storage_client.empty_bucket(gcs_utils.get_drc_bucket())
    test_util.delete_all_tables(self.bigquery_dataset_id)
def tearDown(self):
    self._empty_bucket()
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.empty_bucket(bucket_nyc)
    test_util.empty_bucket(gcs_utils.get_drc_bucket())
    self.testbed.deactivate()
def setUp(self):
    self.ehr_dataset_id = bq_utils.get_dataset_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    self.combined_dataset_id = bq_utils.get_combined_dataset_id()
    self.drc_bucket = gcs_utils.get_drc_bucket()
    test_util.delete_all_tables(self.combined_dataset_id)
def _empty_drc_bucket(self):
    bucket = gcs_utils.get_drc_bucket()
    self._empty_bucket(bucket)
def write_to_result_table(project, dataset, site, match_values):
    """
    Append items in match_values to the table generated from the site name.

    Attempts to limit the insert query to less than 1MB.

    :param site: string identifier for the hpo site.
    :param match_values: dictionary of person_ids and match values for a field
    :param project: the BigQuery project name
    :param dataset: name of the dataset containing the table to append to

    :return: query results value
    :raises: oauth2client.client.HttpAccessTokenRefreshError,
        googleapiclient.errors.HttpError
    """
    if not match_values:
        LOGGER.info(f"No values to insert for site: {site}")
        return None

    result_table = site + consts.VALIDATION_TABLE_SUFFIX
    bucket = gcs_utils.get_drc_bucket()
    path = dataset + '/intermediate_results/' + site + '.csv'

    field_list = [consts.PERSON_ID_FIELD]
    field_list.extend(consts.VALIDATION_FIELDS)
    field_list.append(consts.ALGORITHM_FIELD)

    results = StringIO()
    field_list_str = ','.join(field_list) + '\n'
    results.write(field_list_str)

    LOGGER.info(f"Generating csv values to write to storage for site: {site}")

    for person_key, person_values in match_values.items():
        str_list = [str(person_key)]
        for field in consts.VALIDATION_FIELDS:
            value = str(person_values.get(field, consts.MISSING))
            str_list.append(value)

        str_list.append(consts.YES)
        val_str = ','.join(str_list)
        results.write(val_str + '\n')

    LOGGER.info(f"Writing csv file to cloud storage for site: {site}")

    # write results
    results.seek(0)
    gcs_utils.upload_object(bucket, path, results)
    results.close()
    LOGGER.info(
        f"Wrote {len(match_values)} items to cloud storage for site: {site}")

    # wait on results to be written
    schema_path = os.path.join(fields_path, 'identity_match.json')

    LOGGER.info(
        f"Beginning load of identity match values from csv into BigQuery "
        f"for site: {site}")
    try:
        # load csv file into bigquery
        results = bq_utils.load_csv(schema_path,
                                    'gs://' + bucket + '/' + path,
                                    project,
                                    dataset,
                                    result_table,
                                    write_disposition=consts.WRITE_TRUNCATE)

        # ensure the load job finishes
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"Encountered an exception when loading records from csv for site: {site}"
        )
        raise

    LOGGER.info(f"Loaded match values for site: {site}")

    return results
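

# Hedged illustration, assumed input shape rather than taken from the source:
# match_values maps person_ids to per-field match outcomes, and the csv built
# above gains one row per person plus the trailing algorithm column. The
# literal strings and the call arguments below are placeholders for whatever
# values this codebase actually uses.
example_match_values = {
    1001: {'first_name': 'match', 'last_name': 'match'},
    1002: {'first_name': 'no_match'},  # absent fields become consts.MISSING
}
# write_to_result_table('my-project', 'validation_20190501', 'fake_site',
#                       example_match_values)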
import json
import os

import gcs_utils
from tools.consolidated_reports import query_reports
from io import open

DRC_BUCKET_PATH = 'gs://%s/' % gcs_utils.get_drc_bucket()
DATASOURCES_PATH = 'curation_report/data/datasources.json'


def get_hpo_id(p):
    rel_path = p[len(DRC_BUCKET_PATH):]
    return rel_path[:rel_path.index('/')]


def get_report_path(p, hpo_id):
    return p.replace('datasources.json', hpo_id)


def get_submission_name(p):
    parts = p.split('/')
    for i in range(0, len(parts)):
        part = parts[i]
        if part == 'curation_report':
            return parts[i - 1]
    raise RuntimeError('Invalid submission path: %s' % p)


def transform_bq_list(uploads):
    """
def run_gcs_retraction(project_id, sandbox_dataset_id, pid_table_id, hpo_id,
                       folder, force_flag):
    """
    Retract from a folder/folders in a GCS bucket all records associated with a pid

    :param project_id: project containing the sandbox dataset
    :param sandbox_dataset_id: dataset containing the pid_table
    :param pid_table_id: table containing the person_ids whose data needs to be retracted
    :param hpo_id: hpo_id of the site to run retraction on
    :param folder: the site's submission folder; if set to 'all_folders',
        retract from all folders by the site
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract, as a list of lists
    """
    # extract the pids
    pids = extract_pids_from_table(project_id, sandbox_dataset_id,
                                   pid_table_id)

    bucket = gcs_utils.get_drc_bucket()
    logger.info('Retracting from bucket %s' % bucket)
    site_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    full_bucket_path = bucket + '/' + hpo_id + '/' + site_bucket
    folder_prefixes = gcs_utils.list_bucket_prefixes(full_bucket_path)

    # retract from latest folders first
    folder_prefixes.sort(reverse=True)

    result_dict = {}
    if folder == 'all_folders':
        to_process_folder_list = folder_prefixes
    else:
        folder_path = full_bucket_path + '/' + folder if folder[
            -1] == '/' else full_bucket_path + '/' + folder + '/'
        if folder_path in folder_prefixes:
            to_process_folder_list = [folder_path]
        else:
            logger.info('Folder %s does not exist in %s. Exiting' %
                        (folder, full_bucket_path))
            return result_dict

    logger.info("Retracting data from the following folders:")
    logger.info([
        bucket + '/' + folder_prefix
        for folder_prefix in to_process_folder_list
    ])

    for folder_prefix in to_process_folder_list:
        logger.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        bucket_items = gcs_utils.list_bucket_dir(bucket + '/' +
                                                 folder_prefix[:-1])
        found_files = []
        folder_items = [
            item['name'].split('/')[-1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            # Only retract from CDM or PII files
            item = item.lower()
            if item in resources.CDM_FILES or item in common.PII_FILES:
                found_files.append(item)

        logger.info('Found the following files to retract data from:')
        logger.info([
            bucket + '/' + folder_prefix + file_name
            for file_name in found_files
        ])

        logger.info("Proceed?")
        if force_flag:
            logger.info(
                "Attempting to force retract for folder %s in bucket %s" %
                (folder_prefix, bucket))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            response = get_response()
        if response == "Y":
            folder_upload_output = retract(pids, bucket, found_files,
                                           folder_prefix, force_flag)
            result_dict[folder_prefix] = folder_upload_output
            logger.info("Retraction completed for folder %s/%s " %
                        (bucket, folder_prefix))
        elif response.lower() == "n":
            logger.info("Skipping folder %s" % folder_prefix)
    logger.info("Retraction from GCS complete")
    return result_dict
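

# Hedged usage sketch: every identifier below (project, dataset, table, hpo
# and folder names) is a placeholder. Passing force_flag=True skips the
# interactive Y/N prompt handled by get_response() above.
if __name__ == '__main__':
    retraction_results = run_gcs_retraction(project_id='my-project',
                                            sandbox_dataset_id='sandbox',
                                            pid_table_id='pid_table',
                                            hpo_id='fake_hpo',
                                            folder='all_folders',
                                            force_flag=True)
    logger.info('Retracted from %d folders' % len(retraction_results))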
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset: the dataset containing the pii information for
        comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_FIRST,
                                            consts.FIRST_NAME_FIELD)
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.FIRST_NAME_FIELD)

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_LAST,
                                            consts.LAST_NAME_FIELD)
        # write last name matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.LAST_NAME_FIELD)

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_MIDDLE,
                                            consts.MIDDLE_NAME_FIELD)
        # write middle name matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.MIDDLE_NAME_FIELD)

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(project, validation_dataset,
                                          rdr_dataset, ehr_dataset, site,
                                          consts.OBS_PII_STREET_ADDRESS_ZIP,
                                          consts.ZIP_CODE_FIELD)
        # write zip code matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.ZIP_CODE_FIELD)

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(project, validation_dataset,
                                       rdr_dataset, ehr_dataset, site,
                                       consts.OBS_PII_STREET_ADDRESS_CITY,
                                       consts.CITY_FIELD)
        # write city matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.CITY_FIELD)

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(project, validation_dataset,
                                       rdr_dataset, ehr_dataset, site,
                                       consts.OBS_PII_STREET_ADDRESS_STATE,
                                       consts.STATE_FIELD)
        # write state matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.STATE_FIELD)

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)
        # write street address matches for hpo to table
        writers.append_to_result_table(site, address_one_matches, project,
                                       validation_dataset,
                                       consts.ADDRESS_ONE_FIELD)
        writers.append_to_result_table(site, address_two_matches, project,
                                       validation_dataset,
                                       consts.ADDRESS_TWO_FIELD)

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_EMAIL_ADDRESS,
                                                consts.EMAIL_FIELD)
        # write email matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.EMAIL_FIELD)

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(project, validation_dataset,
                                              ehr_dataset, site,
                                              consts.OBS_PII_PHONE,
                                              consts.PHONE_NUMBER_FIELD)
        # write phone number matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.PHONE_NUMBER_FIELD)

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(project, validation_dataset,
                                        ehr_dataset, site, consts.OBS_PII_SEX)
        # write gender matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.SEX_FIELD)

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_BIRTH_DATETIME)
        # write birth date matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.BIRTH_DATE_FIELD)

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset,
                                                site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset,
                                              site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE)
        writers.create_site_validation_report(project, validation_dataset,
                                              [site], bucket, filename)

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(project, validation_dataset,
                                          hpo_sites, bucket, filename)

    return results