def merge(dataset_id, project_id):
    """merge hpo ehr data

    :dataset_id: source and target dataset
    :project_id: project in which everything happens
    :returns: list of tables generated successfully
    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)

    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)
    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading, [drop and] create all tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable']['tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' % (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
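# Hedged usage sketch for the merge above; the dataset and project ids below
# are illustrative placeholders, not values from the source.
if __name__ == '__main__':
    merged_tables = merge(dataset_id='ehr_dataset', project_id='aou-ehr-project')
    logging.info('Tables merged successfully: %s' % merged_tables)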
def get_full_result_log():
    full_log = []
    for hpo in resources.hpo_csv():
        hpo_id = hpo['hpo_id']
        hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
        try:
            # TODO : figure out possible errors and catch specific bucket inexistence error
            obj_metadata = gcs_utils.get_metadata(hpo_bucket, RESULT_CSV)
        except:
            logging.warning(
                'skipping hpo {}. bucket does not exist.'.format(hpo))
            continue
        if obj_metadata is None:
            logging.info('%s was not found in %s' % (RESULT_CSV, hpo_bucket))
        else:
            hpo_result = gcs_utils.get_object(hpo_bucket, RESULT_CSV)
            hpo_result_file = StringIO.StringIO(hpo_result)
            hpo_result_items = resources._csv_file_to_list(hpo_result_file)
            result_objects = map(
                lambda item: hpo_log_item_to_obj(hpo_id, item),
                hpo_result_items)
            full_log.extend(result_objects)
    return full_log
def setUp(self):
    self.project_id = 'project_id'
    self.dataset_id = 'dataset_id'
    self.obs_fields = [{
        "type": "integer",
        "name": "observation_id",
        "mode": "nullable",
        "description": "The observation_id used in the observation table."
    }, {
        "type": "string",
        "name": "src_id",
        "mode": "nullable",
        "description": "The provenance of the data associated with the observation_id."
    }]
    self.hpo_list = resources.hpo_csv()
    self.mapping_tables = [
        gen_ext.MAPPING_PREFIX + cdm_table
        for cdm_table in common.AOU_REQUIRED
        if cdm_table not in [common.PERSON, common.DEATH, common.FACT_RELATIONSHIP]
    ]
    self.bq_string = '("{hpo_name}", "EHR site nnn")'
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in common.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(
            dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in tables_to_map():
        logging.info('Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id)

    # Load all tables with union of submitted tables
    for table_name in common.CDM_TABLES:
        logging.info('Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)
def get_hpo_ids():
    """
    Get identifiers for all HPO sites

    :return: A list of HPO ids
    """
    return [hpo_item[consts.HPO_ID] for hpo_item in resources.hpo_csv()]
def test_get_full_result_log_when_all_exist(self, mock_hpo_csv):
    self._empty_hpo_buckets()
    hpos = resources.hpo_csv()

    hpo_0 = hpos[0]
    hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
    with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
        gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
    with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
        hpo_0_expected_items = json.load(fp)
    for item in hpo_0_expected_items:
        item['hpo_id'] = hpo_0['hpo_id']

    hpo_1 = hpos[1]
    hpo_1_bucket = gcs_utils.get_hpo_bucket(hpo_1['hpo_id'])
    with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT, 'r') as fp:
        gcs_utils.upload_object(hpo_1_bucket, common.RESULT_CSV, fp)
    with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT_NO_HPO_JSON, 'r') as fp:
        hpo_1_expected_items = json.load(fp)
    for item in hpo_1_expected_items:
        item['hpo_id'] = hpo_1['hpo_id']

    expected = hpo_0_expected_items + hpo_1_expected_items
    actual = main.get_full_result_log()
    self.assertResultLogItemsEqual(expected, actual)
def get_hpo_name(hpo_id):
    hpo_name = "HPO"
    hpo_list_of_dicts = resources.hpo_csv()
    for hpo_dict in hpo_list_of_dicts:
        if hpo_dict['hpo_id'].lower() == hpo_id.lower():
            hpo_name = hpo_dict['name']
            return hpo_name
    return hpo_name
def validate_all_hpos():
    """
    validation end point for all hpo_ids
    """
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        process_hpo(hpo_id)
    return 'validation done!'
def get_hpo_site_names():
    """
    Return a list of hpo site ids.

    :return: A list of string hpo site ids
    """
    hpo_ids = []
    for site in rc.hpo_csv():
        hpo_ids.append(site[consts.HPO_ID])
    return hpo_ids
def test_convert_to_bq_string(self):
    hpo_rdr_mapping_list = gen_ext.get_hpo_and_rdr_mappings()
    hpo_bq_list = []
    for hpo in resources.hpo_csv():
        hpo_bq_list.append(self.bq_string.format(hpo_name=hpo["hpo_id"]))
    hpo_bq_list.append('("{rdr}", "{ppi_pm}")'.format(rdr=gen_ext.RDR,
                                                      ppi_pm=gen_ext.PPI_PM))
    expected = ', '.join(hpo_bq_list)
    actual = gen_ext.convert_to_bq_string(hpo_rdr_mapping_list)
    self.assertEqual(len(actual), len(expected))
def generate_site_mappings():
    """
    Generates the mapping table for the site names and the masked names

    :return: returns dict with key: hpo_id, value: rand int
    """
    hpo_list = resources.hpo_csv()
    rand_list = random.sample(range(100, 999), len(hpo_list))
    mapping_dict = dict()
    for i, hpo_dict in enumerate(hpo_list):
        mapping_dict[hpo_dict["hpo_id"]] = rand_list[i]
    return mapping_dict
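# Minimal sketch showing the shape of the mapping returned above; the masked
# values are random three-digit ints, so the actual output differs per run.
if __name__ == '__main__':
    site_map = generate_site_mappings()
    for hpo_id, masked_id in site_map.items():
        print('{hpo_id} -> {masked_id}'.format(hpo_id=hpo_id, masked_id=masked_id))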
def find_hpo(hpo_id, hpo_name):
    """
    Finds if the HPO is already available in hpo.csv

    :param hpo_id: hpo identifier
    :param hpo_name: HPO name
    :return: the matching hpo.csv row if found, otherwise None
    """
    hpos = resources.hpo_csv()
    for hpo in hpos:
        if hpo['hpo_id'] == hpo_id or hpo['name'] == hpo_name:
            return hpo
    return None
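# Hedged usage sketch; 'fake_hpo' and 'Fake HPO Site' are illustrative values
# only, not ids taken from hpo.csv.
existing_hpo = find_hpo('fake_hpo', 'Fake HPO Site')
if existing_hpo is None:
    print('HPO not present in hpo.csv; safe to add')
else:
    print('HPO already present: {hpo}'.format(hpo=existing_hpo))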
def test_get_full_result_log_when_one_does_not_exist(self, mock_hpo_csv):
    self._empty_hpo_buckets()
    hpos = resources.hpo_csv()
    hpo_0 = hpos[0]
    hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
    with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
        gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
    with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
        expected = json.load(fp)
    for item in expected:
        item['hpo_id'] = hpo_0['hpo_id']
    actual = main.get_full_result_log()
    self.assertResultLogItemsEqual(expected, actual)
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(
            dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(
            'Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(
            'Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = PERSON_TABLE
    logging.info('Mapping {domain_table}...'.format(domain_table=domain_table))
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation,
    # one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)
    logging.info('Completed Person to Observation')
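# Hypothetical invocation of the union pipeline above; the dataset and project
# ids are placeholders, not names from the source.
if __name__ == '__main__':
    main(input_dataset_id='ehr_submissions',
         output_dataset_id='unioned_ehr',
         project_id='aou-curation-project')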
def render():
    """
    Render cron file

    :return: a str representation of the cron file
    """
    j2_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(resources.TEMPLATES_PATH))
    tpl = j2_env.get_template(resources.CRON_TPL_YAML)
    # TODO obtain cron urls from validation.main/app_base.yaml instead of through template
    hpos = resources.hpo_csv()
    yesterday = get_yesterday_expr()
    result = tpl.render(hpos=hpos, yesterday=yesterday)
    return result
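# A minimal sketch of persisting the rendered cron configuration; the output
# path 'cron.yaml' is an assumption, not taken from the source.
if __name__ == '__main__':
    with open('cron.yaml', 'w') as cron_fp:
        cron_fp.write(render())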
def validate_all_hpos():
    """
    validation end point for individual hpo_ids
    """
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        try:
            run_validation(hpo_id)
        except BucketDoesNotExistError as bucket_error:
            bucket = bucket_error.bucket
            logging.warn(
                'Bucket `{bucket}` configured for hpo_id `{hpo_id}` does not exist'
                .format(bucket=bucket, hpo_id=hpo_id))
    return 'validation done!'
def get_hpo_name(hpo_id):
    hpo_list_of_dicts = resources.hpo_csv()
    for hpo_dict in hpo_list_of_dicts:
        if hpo_dict['hpo_id'].lower() == hpo_id.lower():
            return hpo_dict['name']
    raise ValueError('%s is not a valid hpo_id' % hpo_id)
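# Usage sketch showing the error path of get_hpo_name; 'unknown_site' is an
# illustrative id assumed not to appear in hpo.csv.
try:
    hpo_name = get_hpo_name('unknown_site')
except ValueError as err:
    print(err)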
""" Functionality: This script loops through achilles_heel_results table of each site and gets the 5 most common heel errors per site and stores them in a csv file. """ import resources import bq_utils import os import json import csv from google.appengine.api.app_identity import app_identity hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()] HEEL_ERRORS_JSON = 'heel_errors.json' HEEL_ERRORS_CSV = 'heel_errors.csv' heel_error_query = '''select '{hpo_id}' as hpo_name, analysis_id, achilles_heel_warning as heel_error, rule_id, record_count FROM `{app_id}.{dataset_id}.{hpo_id}_achilles_heel_results` WHERE achilles_heel_warning like 'ERROR:%' order by record_count desc limit 5''' def parse_json_csv(): """ :param key: hard coded value used to select the keys of json files in first iteration of the for loop as the header :return: None """ input_file = open(HEEL_ERRORS_JSON) parsed_json = json.load(input_file)
def _empty_hpo_buckets(self, mock_hpo_csv):
    hpos = resources.hpo_csv()
    for hpo in hpos:
        hpo_id = hpo['hpo_id']
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
        self._empty_bucket(bucket)
""" Fetch the most prevalent achilles heel errors in a dataset """ import csv import json import os from functools import partial import app_identity import bq_utils import common import resources from io import open HPO_ID_LIST = [item['hpo_id'] for item in resources.hpo_csv()] JSON = 'json' CSV = 'csv' OUTPUT_FORMATS = [CSV, JSON] # Query-related data variables # # Output field names FIELD_DATASET_NAME = 'dataset_name' FIELD_ANALYSIS_ID = 'analysis_id' FIELD_ACHILLES_HEEL_WARNING = 'achilles_heel_warning' FIELD_HEEL_ERROR = 'heel_error' FIELD_RULE_ID = 'rule_id' FIELD_RECORD_COUNT = 'record_count' FIELDS = [ FIELD_DATASET_NAME, FIELD_ANALYSIS_ID, FIELD_HEEL_ERROR, FIELD_RULE_ID, FIELD_RECORD_COUNT