Esempio n. 1
0
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            combined_dataset_id=bq_utils.get_combined_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            combined_dataset_id=bq_utils.get_combined_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)
Esempio n. 2
0
 def _all_rdr_records_included(self):
     """
     All rdr records are included whether or not there is corresponding ehr record
     """
     for domain_table in DOMAIN_TABLES:
         mapping_table = mapping_table_for(domain_table)
         query = (
             'SELECT rt.{domain_table}_id as id '
             'FROM `{rdr_dataset_id}.{domain_table}` AS rt '
             'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m '
             'ON rt.{domain_table}_id = m.src_{domain_table}_id '
             'WHERE '
             '  m.{domain_table}_id IS NULL '
             'OR NOT EXISTS '
             ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t '
             '  WHERE t.{domain_table}_id = m.{domain_table}_id)').format(
                 domain_table=domain_table,
                 rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                 combined_dataset_id=bq_utils.get_combined_dataset_id(),
                 mapping_table=mapping_table)
         response = bq_utils.query(query)
         rows = bq_utils.response2rows(response)
         self.assertEqual(
             0, len(rows),
             "RDR records should map to records in mapping and combined tables"
         )
    def setUp(self):
        self.project_id = bq_utils.app_identity.get_application_id()
        self.dataset_id = bq_utils.get_combined_dataset_id()
        self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
        if not self.project_id or not self.dataset_id:
            # TODO: Fix handling of globals, push these assertions down if they are required.
            raise ValueError(
                f"missing configuration for project ('{self.project_id}') " +
                f"and/or dataset ('{self.dataset_id}')")

        # TODO: Reconcile this with a consistent integration testing model. Ideally each test should
        # clean up after itself so that we don't need this defensive check.
        test_util.delete_all_tables(self.dataset_id)

        # drop concept table
        drop_concept_table(self.dataset_id)

        create_tables = ['person', 'observation']
        table_fields = {
            'person': 'post_deid_person',
            'observation': 'observation',
            'concept': 'concept'
        }
        for tbl in ['concept']:
            if not bq_utils.table_exists(tbl, dataset_id=self.dataset_id):
                create_tables.append(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(table_fields[tbl],
                                           tbl,
                                           dataset_id=self.dataset_id,
                                           force_all_nullable=True)
Esempio n. 4
0
def clean_combined_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the ehr and rdr dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified.  Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_combined_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)

    query_list = _gather_combined_queries(project_id, dataset_id,
                                          sandbox_dataset_id)

    LOGGER.info("Cleaning combined_dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.COMBINED)
Esempio n. 5
0
def validate_pii():
    project = bq_utils.app_identity.get_application_id()
    combined_dataset = bq_utils.get_combined_dataset_id()
    ehr_dataset = bq_utils.get_dataset_id()
    dest_dataset = bq_utils.get_validation_results_dataset_id()
    logging.info('Calling match_participants')
    _, errors = matching.match_participants(project, combined_dataset,
                                            ehr_dataset, dest_dataset)

    if errors > 0:
        logging.error("Errors encountered in validation process")

    return consts.VALIDATION_SUCCESS
Esempio n. 6
0
def fact_relationship_query():
    """
    Load fact_relationship, using mapped IDs based on domain concept in fact 1 and fact 2
    :return:
    """
    return combine_consts.FACT_RELATIONSHIP_QUERY.format(
        rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
        combined_dataset_id=bq_utils.get_combined_dataset_id(),
        mapping_measurement=mapping_table_for('measurement'),
        ehr_dataset=bq_utils.get_dataset_id(),
        mapping_observation=mapping_table_for('observation'),
        measurement_domain_concept_id=common.MEASUREMENT_DOMAIN_CONCEPT_ID,
        observation_domain_concept_id=common.OBSERVATION_DOMAIN_CONCEPT_ID)
Esempio n. 7
0
def person_query(table_name):
    """
    Maps location and care_Site id in person table

    :return: query
    """
    combined_dataset_id = bq_utils.get_combined_dataset_id()
    columns, join_expression = join_expression_generator(
        table_name, combined_dataset_id)
    return combine_consts.MAPPED_PERSON_QUERY.format(
        cols=columns,
        dataset=combined_dataset_id,
        table=table_name,
        join_expr=join_expression)
Esempio n. 8
0
def create_cdm_tables():
    """
    Create all CDM tables

    Note: Recreates any existing tables
    """
    combined_dataset_id = bq_utils.get_combined_dataset_id()
    for table in resources.CDM_TABLES:
        logger.info('Creating table {dataset}.{table}...'.format(
            table=table, dataset=combined_dataset_id))
        bq_utils.create_standard_table(table,
                                       table,
                                       drop_existing=True,
                                       dataset_id=combined_dataset_id)
Esempio n. 9
0
def query(q, dst_table_id, write_disposition='WRITE_APPEND'):
    """
    Run query and block until job is done
    :param q: SQL statement
    :param dst_table_id: if set, output is saved in a table with the specified id
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    """
    dst_dataset_id = bq_utils.get_combined_dataset_id()
    query_job_result = bq_utils.query(q,
                                      destination_table_id=dst_table_id,
                                      write_disposition=write_disposition,
                                      destination_dataset_id=dst_dataset_id)
    query_job_id = query_job_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    def test_execute_queries(self):
        project_id = bq_utils.app_identity.get_application_id()
        dataset_id = bq_utils.get_combined_dataset_id()
        sandbox_id = bq_utils.get_unioned_dataset_id()
        test_util.delete_all_tables(dataset_id)

        create_tables = (
            ['person'] + common.CLINICAL_DATA_TABLES +
            ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
        # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
        for tbl in ['concept', 'concept_ancestor']:
            if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
                create_tables.push(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(tbl,
                                           tbl,
                                           dataset_id=dataset_id,
                                           force_all_nullable=True)

        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            resp = bq_utils.query(
                tmpl.render(project_id=project_id,
                            dataset_id=dataset_id,
                            rdr_basics_concept_id=123,
                            rdr_consent_concept_id=345,
                            ehr_obs_concept_id=567,
                            rdr_basics_module_concept_id=
                            drop_participants_without_ppi_or_ehr.
                            BASICS_MODULE_CONCEPT_ID))
            self.assertTrue(resp["jobComplete"])

        clean_cdr_engine.clean_dataset(
            project_id, dataset_id, sandbox_id,
            [(drop_participants_without_ppi_or_ehr.get_queries, )])

        def table_to_person_ids(t):
            rows = bq_utils.response2rows(
                bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                    project_id, dataset_id, t)))
            return set([r["person_id"] for r in rows])

        # We expect participants 1, 5 to have been removed from all tables.
        self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
        self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
        self.assertEquals(set([3, 4]), table_to_person_ids("drug_exposure"))

        test_util.delete_all_tables(dataset_id)
Esempio n. 11
0
def copy_ehr_table(table):
    """
    Copy table from EHR (consenting participants only) to the combined dataset without regenerating ids

    Note: Overwrites if a table already exists
    """
    fields = resources.fields_for(table)
    field_names = [field['name'] for field in fields]
    if 'person_id' not in field_names:
        raise RuntimeError(
            'Cannot copy EHR table {table}. It is missing columns needed for consent filter'
            .format(table=table))
    q = combine_consts.COPY_EHR_QUERY.format(
        ehr_dataset_id=bq_utils.get_dataset_id(),
        table=table,
        ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID,
        combined_dataset_id=bq_utils.get_combined_dataset_id())
    logger.info('Query for {table} is `{q}`'.format(table=table, q=q))
    query(q, table)
Esempio n. 12
0
def load_query(domain_table):
    """
    Returns query used to load a domain table

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    ehr_dataset_id = bq_utils.get_dataset_id()
    combined_dataset_id = bq_utils.get_combined_dataset_id()
    mapping_table = mapping_table_for(domain_table)
    cols, join_expression = join_expression_generator(domain_table,
                                                      combined_dataset_id)

    return combine_consts.LOAD_QUERY.format(
        cols=cols,
        domain_table=domain_table,
        rdr_dataset_id=rdr_dataset_id,
        ehr_dataset_id=ehr_dataset_id,
        mapping_table=mapping_table,
        join_expr=join_expression,
        combined_dataset_id=combined_dataset_id)
Esempio n. 13
0
 def setUp(self):
     self.ehr_dataset_id = bq_utils.get_dataset_id()
     self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     self.combined_dataset_id = bq_utils.get_combined_dataset_id()
     self.drc_bucket = gcs_utils.get_drc_bucket()
     test_util.delete_all_tables(self.combined_dataset_id)