Example #1
def merge(dataset_id, project_id):
    """merge hpo ehr data

    :dataset_id: source and target dataset
    :project_id: project in which everything happens
    :returns: list of tables generated successfully

    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)
    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)
    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading [drop and] create all tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable'][
                'tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' %
                              (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
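A minimal usage sketch; the dataset and project identifiers below are hypothetical placeholders, not values from the source:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # hypothetical identifiers; substitute a real EHR dataset and GCP project
    created_tables = merge(dataset_id='ehr_dataset', project_id='my-curation-project')
    logging.info('Tables created: %s' % created_tables)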
Example #2
def get_full_result_log():
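    """
    Build the combined validation result log across all HPO sites.

    :return: a list of result log items (dicts), one per row of each site's
        results file found in GCS
    """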
    full_log = []
    for hpo in resources.hpo_csv():
        hpo_id = hpo['hpo_id']
        hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)

        try:
            # TODO : figure out possible errors and catch specific bucket inexistence error
            obj_metadata = gcs_utils.get_metadata(hpo_bucket, RESULT_CSV)
        except Exception:
            logging.warning(
                'skipping hpo {}. bucket does not exist.'.format(hpo_id))
            continue

        if obj_metadata is None:
            logging.info('%s was not found in %s' % (RESULT_CSV, hpo_bucket))
        else:
            hpo_result = gcs_utils.get_object(hpo_bucket, RESULT_CSV)
            hpo_result_file = StringIO.StringIO(hpo_result)
            hpo_result_items = resources._csv_file_to_list(hpo_result_file)
            result_objects = map(
                lambda item: hpo_log_item_to_obj(hpo_id, item),
                hpo_result_items)
            full_log.extend(result_objects)
    return full_log
Example #3
    def setUp(self):
        self.project_id = 'project_id'
        self.dataset_id = 'dataset_id'
        self.obs_fields = [{
            "type": "integer",
            "name": "observation_id",
            "mode": "nullable",
            "description": "The observation_id used in the observation table."
        }, {
            "type": "string",
            "name": "src_id",
            "mode": "nullable",
            "description": "The provenance of the data associated with the observation_id."
        }]
        self.hpo_list = resources.hpo_csv()
        self.mapping_tables = [
            gen_ext.MAPPING_PREFIX + cdm_table
            for cdm_table in common.AOU_REQUIRED
            if cdm_table not in [common.PERSON, common.DEATH, common.FACT_RELATIONSHIP]
        ]
        self.bq_string = '("{hpo_name}", "EHR site nnn")'
Example #4
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in common.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table, result_table, drop_existing=True, dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in tables_to_map():
        logging.info('Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id, project_id)

    # Load all tables with union of submitted tables
    for table_name in common.CDM_TABLES:
        logging.info('Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)
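A minimal invocation sketch; all identifiers below are hypothetical placeholders, not values from the source:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # hypothetical dataset and project identifiers
    main(input_dataset_id='ehr_submissions',
         output_dataset_id='unioned_ehr',
         project_id='my-curation-project')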
Example #5
def get_hpo_ids():
    """
    Get identifiers for all HPO sites

    :return: A list of HPO ids
    """
    return [hpo_item[consts.HPO_ID] for hpo_item in resources.hpo_csv()]
Example #6
    def test_get_full_result_log_when_all_exist(self, mock_hpo_csv):
        self._empty_hpo_buckets()
        hpos = resources.hpo_csv()
        hpo_0 = hpos[0]
        hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
        with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
            gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)

        with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
            hpo_0_expected_items = json.load(fp)
            for item in hpo_0_expected_items:
                item['hpo_id'] = hpo_0['hpo_id']

        hpo_1 = hpos[1]
        hpo_1_bucket = gcs_utils.get_hpo_bucket(hpo_1['hpo_id'])
        with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT, 'r') as fp:
            gcs_utils.upload_object(hpo_1_bucket, common.RESULT_CSV, fp)

        with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT_NO_HPO_JSON,
                  'r') as fp:
            hpo_1_expected_items = json.load(fp)
            for item in hpo_1_expected_items:
                item['hpo_id'] = hpo_1['hpo_id']

        expected = hpo_0_expected_items + hpo_1_expected_items
        actual = main.get_full_result_log()
        self.assertResultLogItemsEqual(expected, actual)
Example #7
def get_hpo_name(hpo_id):
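    """
    Get the display name for an HPO site, falling back to "HPO" when the
    hpo_id is not found in hpo.csv.

    :param hpo_id: HPO site identifier
    :return: the site name, or "HPO" if the id is not recognized
    """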
    hpo_name = "HPO"
    hpo_list_of_dicts = resources.hpo_csv()
    for hpo_dict in hpo_list_of_dicts:
        if hpo_dict['hpo_id'].lower() == hpo_id.lower():
            hpo_name = hpo_dict['name']
            return hpo_name
    return hpo_name
Example #8
def validate_all_hpos():
    """
    Validation endpoint for all hpo_ids
    """
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        process_hpo(hpo_id)
    return 'validation done!'
Example #9
def get_hpo_site_names():
    """
    Return a list of hpo site ids.

    :return:  A list of string hpo site ids
    """
    hpo_ids = []
    for site in rc.hpo_csv():
        hpo_ids.append(site[consts.HPO_ID])
    return hpo_ids
Example #10
    def test_convert_to_bq_string(self):
        hpo_rdr_mapping_list = gen_ext.get_hpo_and_rdr_mappings()
        hpo_bq_list = []
        for hpo in resources.hpo_csv():
            hpo_bq_list.append(self.bq_string.format(hpo_name=hpo["hpo_id"]))
        hpo_bq_list.append('("{rdr}", "{ppi_pm}")'.format(
            rdr=gen_ext.RDR, ppi_pm=gen_ext.PPI_PM))
        expected = ', '.join(hpo_bq_list)
        actual = gen_ext.convert_to_bq_string(hpo_rdr_mapping_list)
        self.assertEqual(len(actual), len(expected))
Example #11
def generate_site_mappings():
    """
    Generates the mapping table for the site names and the masked names
    :return: returns dict with key: hpo_id, value: rand int
    """
    hpo_list = resources.hpo_csv()
    rand_list = random.sample(range(100, 999), len(hpo_list))
    mapping_dict = dict()
    for i, hpo_dict in enumerate(hpo_list):
        mapping_dict[hpo_dict["hpo_id"]] = rand_list[i]
    return mapping_dict
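A short usage sketch; the values shown are illustrative only, since the integers are drawn at random, and the hpo_ids are hypothetical:

site_mappings = generate_site_mappings()
# e.g. {'hpo_site_a': 847, 'hpo_site_b': 213, ...}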
Example #12
def find_hpo(hpo_id, hpo_name):
    """
    Finds whether the HPO is already available in hpo.csv

    :param hpo_id: HPO identifier
    :param hpo_name: HPO name
    :return: the matching HPO dict if found, otherwise None
    """
    hpos = resources.hpo_csv()
    for hpo in hpos:
        if hpo['hpo_id'] == hpo_id or hpo['name'] == hpo_name:
            return hpo
    return None
Example #13
    def test_get_full_result_log_when_one_does_not_exist(self, mock_hpo_csv):
        self._empty_hpo_buckets()
        hpos = resources.hpo_csv()
        hpo_0 = hpos[0]
        hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
        with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
            gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
        with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
            expected = json.load(fp)
            for item in expected:
                item['hpo_id'] = hpo_0['hpo_id']
        actual = main.get_full_result_log()
        self.assertResultLogItemsEqual(expected, actual)
Example #14
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(
            dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(
            'Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(
            'Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = PERSON_TABLE
    logging.info('Mapping {domain_table}...'.format(domain_table=domain_table))
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation, one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)

    logging.info('Completed Person to Observation')
Example #15
def render():
    """
    Render cron file

    :return: a str representation of the cron file
    """
    j2_env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(resources.TEMPLATES_PATH))
    tpl = j2_env.get_template(resources.CRON_TPL_YAML)
    # TODO obtain cron urls from validation.main/app_base.yaml instead of through template
    hpos = resources.hpo_csv()
    yesterday = get_yesterday_expr()
    result = tpl.render(hpos=hpos, yesterday=yesterday)
    return result
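A minimal usage sketch; writing the rendered text to cron.yaml is an assumed target path, not something stated in the source:

if __name__ == '__main__':
    # hypothetical output path for the rendered cron configuration
    with open('cron.yaml', 'w') as cron_file:
        cron_file.write(render())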
Example #16
def validate_all_hpos():
    """
    Validation endpoint for individual hpo_ids
    """
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        try:
            run_validation(hpo_id)
        except BucketDoesNotExistError as bucket_error:
            bucket = bucket_error.bucket
            logging.warning(
                'Bucket `{bucket}` configured for hpo_id `{hpo_id}` does not exist'
                .format(bucket=bucket, hpo_id=hpo_id))
    return 'validation done!'
Example #17
def get_hpo_name(hpo_id):
    hpo_list_of_dicts = resources.hpo_csv()
    for hpo_dict in hpo_list_of_dicts:
        if hpo_dict['hpo_id'].lower() == hpo_id.lower():
            return hpo_dict['name']
    raise ValueError('%s is not a valid hpo_id' % hpo_id)
Example #18
"""
This script loops through the achilles_heel_results table of each site, gets the 5 most common heel errors
per site, and stores them in a CSV file.
"""
import resources
import bq_utils
import os
import json
import csv
from google.appengine.api.app_identity import app_identity

hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]
HEEL_ERRORS_JSON = 'heel_errors.json'
HEEL_ERRORS_CSV = 'heel_errors.csv'
heel_error_query = '''SELECT '{hpo_id}' AS hpo_name,
            analysis_id,
            achilles_heel_warning AS heel_error,
            rule_id,
            record_count
            FROM `{app_id}.{dataset_id}.{hpo_id}_achilles_heel_results`
            WHERE achilles_heel_warning LIKE 'ERROR:%'
            ORDER BY record_count DESC LIMIT 5'''


def parse_json_csv():
    """
    Read the heel errors from the JSON file and write them to a CSV file,
    using the keys of the JSON records as the header row.

    :return: None
    """
    with open(HEEL_ERRORS_JSON) as input_file:
        parsed_json = json.load(input_file)
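The example is cut off after the JSON is loaded. A minimal sketch of how the remaining body of parse_json_csv might continue, under the assumption that it simply writes the records to HEEL_ERRORS_CSV with the first record's keys as the header:

    with open(HEEL_ERRORS_CSV, 'w') as output_file:
        # assumes every record shares the keys of the first record
        writer = csv.DictWriter(output_file, fieldnames=list(parsed_json[0].keys()))
        writer.writeheader()
        writer.writerows(parsed_json)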
Example #19
    def _empty_hpo_buckets(self, mock_hpo_csv):
        hpos = resources.hpo_csv()
        for hpo in hpos:
            hpo_id = hpo['hpo_id']
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            self._empty_bucket(bucket)
Example #20
"""
Fetch the most prevalent achilles heel errors in a dataset
"""
import csv
import json
import os
from functools import partial

import app_identity

import bq_utils
import common
import resources
from io import open

HPO_ID_LIST = [item['hpo_id'] for item in resources.hpo_csv()]
JSON = 'json'
CSV = 'csv'
OUTPUT_FORMATS = [CSV, JSON]

# Query-related data variables #
# Output field names
FIELD_DATASET_NAME = 'dataset_name'
FIELD_ANALYSIS_ID = 'analysis_id'
FIELD_ACHILLES_HEEL_WARNING = 'achilles_heel_warning'
FIELD_HEEL_ERROR = 'heel_error'
FIELD_RULE_ID = 'rule_id'
FIELD_RECORD_COUNT = 'record_count'
FIELDS = [
    FIELD_DATASET_NAME, FIELD_ANALYSIS_ID, FIELD_HEEL_ERROR, FIELD_RULE_ID,
    FIELD_RECORD_COUNT