def get_hpo_ids():
    """
    Get identifiers for all HPO sites

    :return: A list of HPO ids
    """
    return [hpo_item[consts.HPO_ID] for hpo_item in bq_utils.get_hpo_info()]

def get_hpo_ids():
    """
    Retrieves list of hpo_ids from lookup_tables.hpo_site_id_mappings

    :return: List of hpo_ids
    """
    return [item['hpo_id'] for item in bq_utils.get_hpo_info()]

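# A minimal sketch of the record shape the helpers in this file assume:
# bq_utils.get_hpo_info() is taken to return a list of dicts keyed by
# 'hpo_id' and 'name'. The sample values below are hypothetical.
sample_hpo_info = [
    {'hpo_id': 'hpo_a', 'name': 'HPO Site A'},
    {'hpo_id': 'hpo_b', 'name': 'HPO Site B'},
]
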
def add_hpo_site_mappings_file_df(hpo_id, hpo_name, org_id, display_order):
    """
    Creates dataframe with hpo_id, hpo_name, org_id, display_order

    :param hpo_id: HPO identifier
    :param hpo_name: name of the HPO
    :param org_id: HPO organization identifier
    :param display_order: index at which the HPO should be added in the table
    :raises ValueError: if hpo_id or hpo_name already exists in the lookup table
    """
    hpo_table = bq_utils.get_hpo_info()
    hpo_table_df = pd.DataFrame(hpo_table)
    # membership must be checked against the column values; `in` on a pandas
    # Series tests the index, not the data
    if hpo_id in hpo_table_df['hpo_id'].values or hpo_name in hpo_table_df[
            'name'].values:
        raise ValueError(
            f"{hpo_id}/{hpo_name} already exists in site lookup table")
    hpo_file_df = pd.read_csv(resources.hpo_site_mappings_path)
    verify_hpo_mappings_up_to_date(hpo_file_df, hpo_table_df)
    if display_order is None:
        display_order = hpo_file_df['Display_Order'].max() + 1
    # shift rows at or below the insertion point down by one slot
    hpo_file_df.loc[hpo_file_df['Display_Order'] >= display_order,
                    'Display_Order'] += 1
    hpo_file_df.loc['-1'] = [org_id, hpo_id, hpo_name, display_order]
    LOGGER.info(f'Added new entry for hpo_id {hpo_id} to '
                f'config/hpo_site_mappings.csv at position {display_order}. '
                f'Please upload to curation-devops repo.')
    return hpo_file_df.sort_values(by='Display_Order')

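# A standalone sketch of the Display_Order shift performed above, on a toy
# frame so the reindexing is easy to verify. The column layout is assumed
# from the append order in add_hpo_site_mappings_file_df.
import pandas as pd

df = pd.DataFrame({
    'Org_ID': ['org_a', 'org_b', 'org_c'],
    'HPO_ID': ['hpo_a', 'hpo_b', 'hpo_c'],
    'Site_Name': ['Site A', 'Site B', 'Site C'],
    'Display_Order': [1, 2, 3],
})

display_order = 2
# rows at or after the insertion point move down one slot
df.loc[df['Display_Order'] >= display_order, 'Display_Order'] += 1
# append the new row; '-1' is a throwaway label, sort_values restores order
df.loc['-1'] = ['org_new', 'hpo_new', 'Site New', display_order]
print(df.sort_values(by='Display_Order'))
# hpo_a keeps slot 1, hpo_new takes slot 2, hpo_b and hpo_c shift to 3 and 4
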
def validate_all_hpos():
    """
    Validation endpoint for all hpo_ids
    """
    for item in bq_utils.get_hpo_info():
        hpo_id = item['hpo_id']
        process_hpo(hpo_id)
    return 'validation done!'

def test_convert_to_bq_string(self, mock_hpo_list):
    mock_hpo_list.return_value = self.hpo_list
    hpo_rdr_mapping_list = gen_ext.get_hpo_and_rdr_mappings()
    hpo_bq_list = []
    for hpo in bq_utils.get_hpo_info():
        hpo_bq_list.append(self.bq_string.format(hpo_name=hpo["hpo_id"]))
    hpo_bq_list.append(f'("{gen_ext.RDR}", "{gen_ext.PPI_PM}")')
    expected = ', '.join(hpo_bq_list)
    actual = gen_ext.convert_to_bq_string(hpo_rdr_mapping_list)
    self.assertEqual(len(actual), len(expected))

def get_hpo_site_names():
    """
    Return a list of HPO site ids.

    :return: A list of string HPO site ids
    """
    hpo_ids = []
    for site in bq_utils.get_hpo_info():
        hpo_ids.append(site[consts.HPO_ID])
    return hpo_ids

def generate_site_mappings():
    """
    Generates the mapping table for the site names and the masked names

    :return: dict with key: hpo_id, value: random int
    """
    hpo_list = bq_utils.get_hpo_info()
    # sampling without replacement guarantees each site gets a distinct code
    rand_list = random.sample(range(100, 999), len(hpo_list))
    mapping_dict = dict()
    for i, hpo_dict in enumerate(hpo_list):
        mapping_dict[hpo_dict["hpo_id"]] = rand_list[i]
    return mapping_dict

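# A minimal sketch of the masking behavior above: random.sample draws without
# replacement, so every site gets a distinct 3-digit code (note that
# range(100, 999) stops at 998). Site ids here are hypothetical.
import random

random.seed(42)  # seeded only to make this sketch reproducible
hpo_ids = ['hpo_a', 'hpo_b', 'hpo_c']
codes = random.sample(range(100, 999), len(hpo_ids))
masked = dict(zip(hpo_ids, codes))
print(masked)  # three distinct codes, each in [100, 998]
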
def find_hpo(hpo_id, hpo_name):
    """
    Finds if the HPO is already available in lookup_tables.hpo_site_id_mappings

    :param hpo_id: HPO identifier
    :param hpo_name: HPO name
    :return: the matching HPO record, or None if no match is found
    """
    hpos = bq_utils.get_hpo_info()
    for hpo in hpos:
        if hpo['hpo_id'] == hpo_id or hpo['name'] == hpo_name:
            return hpo
    return None

def main(input_dataset_id, output_dataset_id, project_id, hpo_ids_ex=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs,
        one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids_ex: (optional) list that identifies HPOs not to process;
        by default all are processed
    :returns: list of tables generated successfully
    """
    client = get_client(project_id)

    logging.info('EHR union started')

    # Get all hpo_ids, dropping any explicitly excluded
    hpo_ids = [item['hpo_id'] for item in bq_utils.get_hpo_info()]
    if hpo_ids_ex:
        hpo_ids = [hpo_id for hpo_id in hpo_ids if hpo_id not in hpo_ids_ex]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info(f'Creating {output_dataset_id}.{result_table}...')
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(f'Mapping {domain_table}...')
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id, client)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(f'Creating union of table {table_name}...')
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # Create person mapping table
    domain_table = common.PERSON
    logging.info(f'Mapping {domain_table}...')
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id, client)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation,
    # one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)
    logging.info('Completed Person to Observation')

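# A hedged usage sketch for the union pipeline above; the project and dataset
# names are placeholders, and the excluded HPO id is hypothetical.
if __name__ == '__main__':
    main(input_dataset_id='ehr_submissions',
         output_dataset_id='unioned_ehr',
         project_id='my-curation-project',
         hpo_ids_ex=['hpo_pending_review'])
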
def render():
    """
    Render cron file

    :return: a str representation of the cron file
    """
    j2_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(resources.TEMPLATES_PATH))
    tpl = j2_env.get_template(resources.CRON_TPL_YAML)
    # TODO obtain cron urls from validation.main/app_base.yaml instead of through template
    hpos = bq_utils.get_hpo_info()
    yesterday = get_yesterday_expr()
    result = tpl.render(hpos=hpos, yesterday=yesterday)
    return result

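# A self-contained sketch of the templating step above, using an inline
# template instead of the real cron YAML; the loop body and URL path are
# assumptions, not the actual template contents.
import jinja2

tpl = jinja2.Template('{% for hpo in hpos %}'
                      '- url: /validate/{{ hpo.hpo_id }}\n'
                      '{% endfor %}')
print(tpl.render(hpos=[{'hpo_id': 'hpo_a'}, {'hpo_id': 'hpo_b'}]))
# - url: /validate/hpo_a
# - url: /validate/hpo_b
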
def test_get_hpo_info(self):
    hpo_info = bq_utils.get_hpo_info()
    self.assertGreater(len(hpo_info), 0)

def get_hpo_name(hpo_id):
    """
    Look up the display name for an HPO id (case-insensitive match)

    :raises ValueError: if hpo_id is not a known HPO id
    """
    hpo_list_of_dicts = bq_utils.get_hpo_info()
    for hpo_dict in hpo_list_of_dicts:
        if hpo_dict['hpo_id'].lower() == hpo_id.lower():
            return hpo_dict['name']
    raise ValueError(f'{hpo_id} is not a valid hpo_id')

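# A hedged usage sketch, assuming records shaped like the sample_hpo_info
# shown earlier; the id below is hypothetical.
print(get_hpo_name('HPO_A'))  # case-insensitive match -> 'HPO Site A'
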
def is_hpo_id(hpo_id):
    return hpo_id in [item['hpo_id'] for item in bq_utils.get_hpo_info()]