def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)
 def _all_rdr_records_included(self):
     """
     All rdr records are included whether or not there is corresponding ehr record
     """
     for domain_table in DOMAIN_TABLES:
         mapping_table = mapping_table_for(domain_table)
         q = '''SELECT rt.{domain_table}_id as id
            FROM {rdr_dataset_id}.{domain_table} rt
            LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m
            ON rt.{domain_table}_id = m.src_{domain_table}_id
            WHERE
              m.{domain_table}_id IS NULL
            OR NOT EXISTS
              (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t
               WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format(
             domain_table=domain_table,
             rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
             ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
             mapping_table=mapping_table)
         response = bq_utils.query(q)
         rows = test_util.response2rows(response)
         self.assertEqual(
             0, len(rows),
             "RDR records should map to records in mapping and combined tables"
         )
Exemple #3
0
def fact_relationship_query():
    """
    Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2
    :return:
    """
    return '''
    SELECT 
      fr.domain_concept_id_1 AS domain_concept_id_1,
      CASE
          WHEN domain_concept_id_1 = 21 
            THEN m1.measurement_id 
          WHEN domain_concept_id_1 = 27
            THEN o1.observation_id
      END AS fact_id_1,
      fr.domain_concept_id_2,
      CASE 
          WHEN domain_concept_id_2 = 21 
            THEN m2.measurement_id
          WHEN domain_concept_id_2 = 27
            THEN o2.observation_id
      END AS fact_id_2,
      fr.relationship_concept_id AS relationship_concept_id
    FROM {rdr_dataset_id}.fact_relationship fr
      LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1
        ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1=21
      LEFT JOIN {combined_dataset_id}.{mapping_observation} o1
        ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1=27
      LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2
        ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2=21
      LEFT JOIN {combined_dataset_id}.{mapping_observation} o2
        ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2=27
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               mapping_measurement=mapping_table_for('measurement'),
               mapping_observation=mapping_table_for('observation'))
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    return '''SELECT DISTINCT
          '{rdr_dataset_id}'  AS src_dataset_id,
          {domain_table}_id  AS src_{domain_table}_id,
          'rdr' as src_hpo_id,
          {domain_table}_id + {mapping_constant}  AS {domain_table}_id
        FROM {rdr_dataset_id}.{domain_table}

        UNION ALL

        SELECT DISTINCT
          '{ehr_dataset_id}'  AS src_dataset_id,
          t.{domain_table}_id AS src_{domain_table}_id,
          v.src_hpo_id AS src_hpo_id,
          t.{domain_table}_id  AS {domain_table}_id
        FROM {ehr_dataset_id}.{domain_table} t
        JOIN {ehr_dataset_id}._mapping_{domain_table}  v on t.{domain_table}_id = v.{domain_table}_id
        WHERE EXISTS
           (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c
            WHERE t.person_id = c.person_id)
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               mapping_constant=common.RDR_ID_CONSTANT,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
Exemple #5
0
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """
    return '''
    WITH all_records AS
    (
        SELECT
          '{rdr_dataset_id}'  AS src_dataset_id, 
          {domain_table}_id AS src_{domain_table}_id
        FROM {rdr_dataset_id}.{domain_table}

        UNION ALL

        SELECT
          '{ehr_dataset_id}'  AS src_dataset_id, 
          {domain_table}_id AS src_{domain_table}_id
        FROM {ehr_dataset_id}.{domain_table} t
        WHERE EXISTS
           (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c 
            WHERE t.person_id = c.person_id)
    )
    SELECT 
      ROW_NUMBER() OVER (ORDER BY src_dataset_id, src_{domain_table}_id) AS {domain_table}_id,
      src_dataset_id,
      src_{domain_table}_id
    FROM all_records
    '''.format(rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
               ehr_dataset_id=bq_utils.get_dataset_id(),
               ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
               domain_table=domain_table,
               ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
 def setUp(self):
     super(CombineEhrRdrTest, self).setUp()
     self.APP_ID = bq_utils.app_identity.get_application_id()
     self.ehr_dataset_id = bq_utils.get_dataset_id()
     self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     self.combined_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
     self.drc_bucket = gcs_utils.get_drc_bucket()
     test_util.delete_all_tables(self.combined_dataset_id)
Exemple #7
0
def create_cdm_tables():
    """
    Create all CDM tables

    Note: Recreates any existing tables
    """
    ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    for table in common.CDM_TABLES:
        logger.debug('Creating table {dataset}.{table}...'.format(table=table, dataset=ehr_rdr_dataset_id))
        bq_utils.create_standard_table(table, table, drop_existing=True, dataset_id=ehr_rdr_dataset_id)
Exemple #8
0
def clean_ehr_rdr_dataset(project=None, dataset=None):
    if dataset is None or dataset == '' or dataset.isspace():
        dataset = bq_utils.get_ehr_rdr_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset)

    query_list = _gather_ehr_rdr_queries(project, dataset)

    LOGGER.info("Cleaning ehr_rdr_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)
Exemple #9
0
def validate_pii():
    project = bq_utils.app_identity.get_application_id()
    combined_dataset = bq_utils.get_ehr_rdr_dataset_id()
    ehr_dataset = bq_utils.get_dataset_id()
    dest_dataset = bq_utils.get_validation_results_dataset_id()
    logging.info('Calling match_participants')
    _, errors = matching.match_participants(project, combined_dataset,
                                            ehr_dataset, dest_dataset)

    if errors > 0:
        logging.error("Errors encountered in validation process")

    return consts.VALIDATION_SUCCESS
def fact_relationship_query():
    """
    Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2
    :return:
    """
    return combine_consts.FACT_RELATIONSHIP_QUERY.format(
        rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
        combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
        mapping_measurement=mapping_table_for('measurement'),
        ehr_dataset=bq_utils.get_dataset_id(),
        mapping_observation=mapping_table_for('observation'),
        measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID,
        observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
Exemple #11
0
def query(q, dst_table_id, write_disposition='WRITE_APPEND'):
    """
    Run query and block until job is done
    :param q: SQL statement
    :param dst_table_id: if set, output is saved in a table with the specified id
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    """
    dst_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition,
                                      destination_dataset_id=dst_dataset_id)
    query_job_id = query_job_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
def person_query(table_name):
    """
    Maps location and care_Site id in person table

    :return: query
    """
    ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    columns, join_expression = join_expression_generator(
        table_name, ehr_rdr_dataset_id)
    return combine_consts.MAPPED_PERSON_QUERY.format(
        cols=columns,
        dataset=ehr_rdr_dataset_id,
        table=table_name,
        join_expr=join_expression)
Exemple #13
0
def get_dataset_and_project_names():
    """
    Get project and dataset names from environment variables.

    :return: A dictionary of dataset names and project name
    """
    project_and_dataset_names = dict()
    project_and_dataset_names[clean_cdr_consts.EHR_DATASET] = bq_utils.get_dataset_id()
    project_and_dataset_names[clean_cdr_consts.UNIONED_EHR_DATASET] = bq_utils.get_unioned_dataset_id()
    project_and_dataset_names[clean_cdr_consts.RDR_DATASET] = bq_utils.get_rdr_dataset_id()
    project_and_dataset_names[clean_cdr_consts.EHR_RDR_DATASET] = bq_utils.get_ehr_rdr_dataset_id()
    project_and_dataset_names[clean_cdr_consts.EHR_RDR_DE_IDENTIFIED] = bq_utils.get_combined_deid_dataset_id()
    project_and_dataset_names[clean_cdr_consts.PROJECT] = app_identity.get_application_id()

    return project_and_dataset_names
Exemple #14
0
def clean_ehr_rdr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the ehr and rdr dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if dataset_id is None or dataset_id == '' or dataset_id.isspace():
        dataset_id = bq_utils.get_ehr_rdr_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s', dataset_id)

    query_list = _gather_ehr_rdr_queries(project_id, dataset_id)

    LOGGER.info("Cleaning ehr_rdr_dataset")
    clean_engine.clean_dataset(project_id, dataset_id, query_list)
def copy_ehr_table(table):
    """
    Copy table from EHR (consenting participants only) to the combined dataset without regenerating ids

    Note: Overwrites if a table already exists
    """
    fields = resources.fields_for(table)
    field_names = [field['name'] for field in fields]
    if 'person_id' not in field_names:
        raise RuntimeError(
            'Cannot copy EHR table {table}. It is missing columns needed for consent filter'
            .format(table=table))
    q = combine_consts.COPY_EHR_QUERY.format(
        ehr_dataset_id=bq_utils.get_dataset_id(),
        table=table,
        ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID,
        ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id())
    logger.debug('Query for {table} is `{q}`'.format(table=table, q=q))
    query(q, table)
def load_query(domain_table):
    """
    Returns query used to load a domain table

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    ehr_dataset_id = bq_utils.get_dataset_id()
    ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    mapping_table = mapping_table_for(domain_table)
    cols, join_expression = join_expression_generator(domain_table,
                                                      ehr_rdr_dataset_id)

    return combine_consts.LOAD_QUERY.format(
        cols=cols,
        domain_table=domain_table,
        rdr_dataset_id=rdr_dataset_id,
        ehr_dataset_id=ehr_dataset_id,
        mapping_table=mapping_table,
        join_expr=join_expression,
        ehr_rdr_dataset_id=ehr_rdr_dataset_id)
def fact_relationship_query():
    """
    Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2
    :return:
    """
    return '''
    SELECT
      fr.domain_concept_id_1 AS domain_concept_id_1,
      CASE
          WHEN domain_concept_id_1 = {measurement_domain_concept_id}
            THEN m1.measurement_id
          WHEN domain_concept_id_1 = {observation_domain_concept_id}
            THEN o1.observation_id
      END AS fact_id_1,
      fr.domain_concept_id_2,
      CASE
          WHEN domain_concept_id_2 = {measurement_domain_concept_id}
            THEN m2.measurement_id
          WHEN domain_concept_id_2 = {observation_domain_concept_id}
            THEN o2.observation_id
      END AS fact_id_2,
      fr.relationship_concept_id AS relationship_concept_id
    FROM {rdr_dataset_id}.fact_relationship fr
      LEFT JOIN {combined_dataset_id}.{mapping_measurement} m1
        ON m1.src_measurement_id = fr.fact_id_1 AND fr.domain_concept_id_1={measurement_domain_concept_id}
      LEFT JOIN {combined_dataset_id}.{mapping_observation} o1
        ON o1.src_observation_id = fr.fact_id_1 AND fr.domain_concept_id_1={observation_domain_concept_id}
      LEFT JOIN {combined_dataset_id}.{mapping_measurement} m2
        ON m2.src_measurement_id = fr.fact_id_2 AND fr.domain_concept_id_2={measurement_domain_concept_id}
      LEFT JOIN {combined_dataset_id}.{mapping_observation} o2
        ON o2.src_observation_id = fr.fact_id_2 AND fr.domain_concept_id_2={observation_domain_concept_id}
    '''.format(
        rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
        combined_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
        mapping_measurement=mapping_table_for('measurement'),
        mapping_observation=mapping_table_for('observation'),
        measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID,
        observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
Exemple #18
0
def load_query(domain_table):
    """
    Returns query used to load a domain table

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    ehr_dataset_id = bq_utils.get_dataset_id()
    ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    mapping_table = mapping_table_for(domain_table)
    has_visit_occurrence_id = False
    id_col = '{domain_table}_id'.format(domain_table=domain_table)
    fields = resources.fields_for(domain_table)

    # Generate column expressions for select, ensuring that
    #  1) we get the record IDs from the mapping table and
    #  2) if there is a reference to `visit_occurrence` we get `visit_occurrence_id` from the mapping visit table
    col_exprs = []
    for field in fields:
        field_name = field['name']
        if field_name == id_col:
            # Use mapping for record ID column
            # m is an alias that should resolve to the associated mapping table
            col_expr = 'm.{field_name} '.format(field_name=field_name)
        elif field_name == VISIT_OCCURRENCE_ID:
            # Replace with mapped visit_occurrence_id
            # mv is an alias that should resolve to the mapping visit table
            # Note: This is only reached when domain_table != visit_occurrence
            col_expr = 'mv.' + VISIT_OCCURRENCE_ID
            has_visit_occurrence_id = True
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ',\n  '.join(col_exprs)

    visit_join_expr = ''
    if has_visit_occurrence_id:
        # Include a join to mapping visit table
        # Note: Using left join in order to keep records that aren't mapped to visits
        mv = mapping_table_for(VISIT_OCCURRENCE)
        visit_join_expr = '''
        LEFT JOIN {ehr_rdr_dataset_id}.{mapping_visit_occurrence} mv 
          ON t.visit_occurrence_id = mv.src_visit_occurrence_id
         AND m.src_dataset_id = mv.src_dataset_id'''.format(
            ehr_rdr_dataset_id=ehr_rdr_dataset_id, mapping_visit_occurrence=mv)

    return '''
    SELECT {cols} 
    FROM {rdr_dataset_id}.{domain_table} t 
      JOIN {ehr_rdr_dataset_id}.{mapping_table} m
        ON t.{domain_table}_id = m.src_{domain_table}_id {visit_join_expr}
    WHERE m.src_dataset_id = '{rdr_dataset_id}'
    
    UNION ALL
    
    SELECT {cols} 
    FROM {ehr_dataset_id}.{domain_table} t 
      JOIN {ehr_rdr_dataset_id}.{mapping_table} m
        ON t.{domain_table}_id = m.src_{domain_table}_id {visit_join_expr}
    WHERE m.src_dataset_id = '{ehr_dataset_id}'
    '''.format(cols=cols,
               domain_table=domain_table,
               rdr_dataset_id=rdr_dataset_id,
               ehr_dataset_id=ehr_dataset_id,
               mapping_table=mapping_table,
               visit_join_expr=visit_join_expr,
               ehr_rdr_dataset_id=ehr_rdr_dataset_id)