Ejemplo n.º 1
0
def get_reroute_domain_mapping_queries(project_id, dataset_id):
    """
    Build the query dicts that reroute mapping records to the appropriate domain.

    One query dict is produced per table in domain_mapping.DOMAIN_TABLE_NAMES. Each query
    overwrites (WRITE_TRUNCATE) the mapping table of the destination domain table in dataset_id.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for rerouting the mapping records to the corresponding
        mapping table
    """
    all_domain_tables = domain_mapping.DOMAIN_TABLE_NAMES

    def _sources_for(target_table):
        # A table is always a source for itself; any other table qualifies only
        # when a domain mapping exists from it to the target
        return [
            candidate for candidate in all_domain_tables
            if candidate == target_table or
            domain_mapping.exist_domain_mappings(candidate, target_table)
        ]

    reroute_queries = []
    for target_table in all_domain_tables:
        rendered_sql = REROUTE_DOMAIN_MAPPING_RECORD_QUERY.render(
            project_id=project_id,
            dataset_id=dataset_id,
            src_tables=_sources_for(target_table),
            dest_table=target_table)

        query_dict = {
            cdr_consts.QUERY: rendered_sql,
            cdr_consts.DESTINATION_TABLE: mapping_table_for(target_table),
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: dataset_id,
        }
        reroute_queries.append(query_dict)

    return reroute_queries
Ejemplo n.º 2
0
 def _all_rdr_records_included(self):
     """
     Check that every RDR record is present in the combined dataset,
     whether or not there is a corresponding EHR record.

     For each domain table the query looks for RDR rows that either have no
     row in the mapping table or whose mapped id has no row in the combined
     domain table. The test asserts that no such rows are returned.
     """
     # LEFT JOIN keeps RDR rows without a mapping row; NOT EXISTS catches
     # mapped ids that are absent from the combined domain table
     query_template = (
         'SELECT rt.{domain_table}_id as id '
         'FROM `{rdr_dataset_id}.{domain_table}` AS rt '
         'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m '
         'ON rt.{domain_table}_id = m.src_{domain_table}_id '
         'WHERE '
         '  m.{domain_table}_id IS NULL '
         'OR NOT EXISTS '
         ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t '
         '  WHERE t.{domain_table}_id = m.{domain_table}_id)')
     failure_message = (
         "RDR records should map to records in mapping and combined tables")

     for table_name in DOMAIN_TABLES:
         table_query = query_template.format(
             domain_table=table_name,
             rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
             combined_dataset_id=bq_utils.get_combined_dataset_id(),
             mapping_table=mapping_table_for(table_name))
         unmapped_rows = bq_utils.response2rows(bq_utils.query(table_query))
         self.assertEqual(0, len(unmapped_rows), failure_message)
Ejemplo n.º 3
0
 def _all_rdr_records_included(self):
     """
     Check that every RDR record is present in the EHR+RDR dataset,
     whether or not there is a corresponding EHR record.

     For each domain table the query looks for RDR rows that either have no
     row in the mapping table or whose mapped id has no row in the EHR+RDR
     domain table. The test asserts that no such rows are returned.
     """
     # The literal is kept exactly as written, including its embedded
     # newlines and indentation, because it is sent to the query engine as is
     query_template = '''SELECT rt.{domain_table}_id as id
            FROM {rdr_dataset_id}.{domain_table} rt
            LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m
            ON rt.{domain_table}_id = m.src_{domain_table}_id
            WHERE
              m.{domain_table}_id IS NULL
            OR NOT EXISTS
              (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t
               WHERE t.{domain_table}_id = m.{domain_table}_id)'''
     failure_message = (
         "RDR records should map to records in mapping and combined tables")

     for table_name in DOMAIN_TABLES:
         table_query = query_template.format(
             domain_table=table_name,
             rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
             ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
             mapping_table=mapping_table_for(table_name))
         unmapped_rows = test_util.response2rows(bq_utils.query(table_query))
         self.assertEqual(0, len(unmapped_rows), failure_message)
Ejemplo n.º 4
0
    def _mapping_table_checks(self):
        """
        Check mapping tables exist, have correct schema, have expected number of records

        For every table in DOMAIN_TABLES this verifies that:
          * its mapping table is among the tables counted in the combined dataset
          * the mapping table's actual fields match resources.fields_for
          * the mapping table's row count equals the consented EHR count plus the RDR count
            (tables listed in expected_diffs are exempt from the count comparison)
        """
        # Restrict EHR counts to rows whose person appears in the EHR consent table
        where = (
            'WHERE EXISTS '
            '  (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
            '   WHERE t.person_id = c.person_id)').format(
                combined_dataset_id=self.combined_dataset_id,
                ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
        ehr_counts = test_util.get_table_counts(self.ehr_dataset_id,
                                                DOMAIN_TABLES, where)
        rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
        combined_counts = test_util.get_table_counts(self.combined_dataset_id)
        output_tables = combined_counts.keys()
        expected_counts = dict()
        expected_diffs = ['observation']

        for table in DOMAIN_TABLES:
            expected_mapping_table = mapping_table_for(table)
            self.assertIn(expected_mapping_table, output_tables)
            expected_fields = resources.fields_for(expected_mapping_table)
            actual_table_info = bq_utils.get_table_info(
                expected_mapping_table, self.combined_dataset_id)
            actual_fields = actual_table_info.get('schema',
                                                  dict()).get('fields', [])
            actual_fields_norm = [
                test_util.normalize_field_payload(field)
                for field in actual_fields
            ]
            self.assertCountEqual(expected_fields, actual_fields_norm)

            # Count should be sum of EHR and RDR
            # (except for tables like observation where extra records are created for demographics)
            has_person_id = any(
                field.get('name', '') == 'person_id'
                for field in resources.fields_for(table))
            if has_person_id:
                unconsented_ehr_records = self.get_unconsented_ehr_records_count(
                    table)
            else:
                unconsented_ehr_records = 0

            actual_count = combined_counts[expected_mapping_table]

            if table in expected_diffs:
                # No independent expectation for these tables; accept the actual count
                expected_count = actual_count
            else:
                expected_count = (ehr_counts[table] -
                                  unconsented_ehr_records) + rdr_counts[table]
            expected_counts[expected_mapping_table] = expected_count

        # assertDictContainsSubset was deprecated in Python 3.2 and removed in 3.12.
        # Comparing the expected counts against the same keys taken from the actual
        # counts performs the same subset check.
        actual_counts = {
            mapping_table: combined_counts.get(mapping_table)
            for mapping_table in expected_counts
        }
        self.assertDictEqual(expected_counts, actual_counts)
Ejemplo n.º 5
0
    def _mapping_table_checks(self):
        """
        Verify the mapping tables in the combined dataset.

        For every table in DOMAIN_TABLES this checks that its mapping table is among the
        tables counted in the combined dataset, that the mapping table's fields match
        resources.fields_for, and that its row count equals the EHR count plus the RDR count
        (tables listed in count_exempt_tables are exempt from the count comparison).
        """
        # Restrict EHR counts to rows whose person appears in the EHR consent table.
        # The literal is kept exactly as written because it is passed on verbatim.
        where_template = '''
                WHERE EXISTS
                   (SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c 
                    WHERE t.person_id = c.person_id)
                '''
        consent_filter = where_template.format(
            ehr_rdr_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
        ehr_counts = test_util.get_table_counts(self.ehr_dataset_id,
                                                DOMAIN_TABLES, consent_filter)
        rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
        combined_counts = test_util.get_table_counts(self.combined_dataset_id)
        combined_tables = combined_counts.keys()
        expected_counts = {}
        count_exempt_tables = ['observation']
        # Show complete diffs when an assertion fails
        self.maxDiff = None

        for domain_table in DOMAIN_TABLES:
            mapping_table = mapping_table_for(domain_table)
            self.assertIn(mapping_table, combined_tables)

            expected_fields = resources.fields_for(mapping_table)
            table_info = bq_utils.get_table_info(mapping_table,
                                                 self.combined_dataset_id)
            schema = table_info.get('schema', dict())
            normalized_fields = [
                test_util.normalize_field_payload(field)
                for field in schema.get('fields', [])
            ]
            self.assertItemsEqual(expected_fields, normalized_fields)

            # Count should be sum of EHR and RDR
            # (except for tables like observation where extra records are created for demographics)
            actual_count = combined_counts[mapping_table]
            if domain_table in count_exempt_tables:
                expected_counts[mapping_table] = actual_count
            else:
                expected_counts[mapping_table] = (ehr_counts[domain_table] +
                                                  rdr_counts[domain_table])

        self.assertDictContainsSubset(expected=expected_counts,
                                      actual=combined_counts)
Ejemplo n.º 6
0
def get_clean_domain_queries(project_id, dataset_id, sandbox_dataset_id):
    """
    Build the query dicts for dropping records that do not belong to the domain table after
    rerouting.

    Three query dicts are produced for each table in domain_mapping.DOMAIN_TABLE_NAMES: a
    sandbox query written to sandbox_dataset_id, a clean-up query that overwrites the domain
    table, and a clean-up query that overwrites its mapping table. In the returned list all
    sandbox queries come before all clean-up queries.

    :param project_id: the project_id passed to the query templates
    :param dataset_id: the dataset_id passed to the query templates; also the destination
        dataset of the clean-up queries
    :param sandbox_dataset_id: the destination dataset of the sandbox queries
    :return: a list of query dicts, sandbox queries first
    """

    def _truncating_query(sql, destination_table, destination_dataset):
        # Every generated query overwrites its destination table
        return {
            cdr_consts.QUERY: sql,
            cdr_consts.DESTINATION_TABLE: destination_table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: destination_dataset,
        }

    def _clean_up_sql(table, sandbox_table, is_mapping):
        return CLEAN_DOMAIN_RECORD_QUERY_TEMPLATE.render(
            project_id=project_id,
            dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            domain_table=table,
            sandbox_table=sandbox_table,
            is_mapping=is_mapping)

    sandbox_stage = []
    clean_up_stage = []
    for table in domain_mapping.DOMAIN_TABLE_NAMES:
        sandbox_table = sandbox_name_for(table)

        sandbox_sql = SANDBOX_DOMAIN_RECORD_QUERY_TEMPLATE.render(
            project_id=project_id, dataset_id=dataset_id, domain_table=table)
        sandbox_stage.append(
            _truncating_query(sandbox_sql, sandbox_table, sandbox_dataset_id))

        # add the clean-up query for the domain table
        clean_up_stage.append(
            _truncating_query(_clean_up_sql(table, sandbox_table, False),
                              table, dataset_id))

        # add the clean-up query for the corresponding mapping of the domain table
        clean_up_stage.append(
            _truncating_query(_clean_up_sql(table, sandbox_table, True),
                              mapping_table_for(table), dataset_id))

    return sandbox_stage + clean_up_stage
Ejemplo n.º 7
0
def get_clean_domain_queries(project_id, dataset_id, sandbox_dataset_id):
    """
    Build the query dicts for dropping records that do not belong to the domain table after
    rerouting.

    Three query dicts are produced for each table in domain_mapping.DOMAIN_TABLE_NAMES: a
    sandbox query written to sandbox_dataset_id, a clean-up query that overwrites the domain
    table, and a clean-up query that overwrites its mapping table. In the returned list all
    sandbox queries come before all clean-up queries.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :param sandbox_dataset_id: sandbox dataset for dataset_id
    :return: list of query dicts to run, sandbox queries first
    """

    def _truncating_query(sql, destination_table, destination_dataset):
        # Every generated query overwrites its destination table
        return {
            cdr_consts.QUERY: sql,
            cdr_consts.DESTINATION_TABLE: destination_table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: destination_dataset,
        }

    def _clean_up_sql(table, sandbox_table, is_mapping):
        return CLEAN_DOMAIN_RECORD_QUERY_TEMPLATE.render(
            project_id=project_id,
            dataset_id=dataset_id,
            sandbox_dataset_id=sandbox_dataset_id,
            domain_table=table,
            sandbox_table=sandbox_table,
            is_mapping=is_mapping)

    sandbox_stage = []
    clean_up_stage = []
    for table in domain_mapping.DOMAIN_TABLE_NAMES:
        # Use non-standard concept if table is observation
        concept_id_column = ('observation_source_concept_id'
                             if table == OBSERVATION else
                             resources.get_domain_concept_id(table))
        sandbox_table = sandbox_name_for(table)

        sandbox_sql = SANDBOX_DOMAIN_RECORD_QUERY_TEMPLATE.render(
            project_id=project_id,
            dataset_id=dataset_id,
            domain_table=table,
            domain_concept_id=concept_id_column)
        sandbox_stage.append(
            _truncating_query(sandbox_sql, sandbox_table, sandbox_dataset_id))

        # add the clean-up query for the domain table
        clean_up_stage.append(
            _truncating_query(_clean_up_sql(table, sandbox_table, False),
                              table, dataset_id))

        # add the clean-up query for the corresponding mapping of the domain table
        clean_up_stage.append(
            _truncating_query(_clean_up_sql(table, sandbox_table, True),
                              mapping_table_for(table), dataset_id))

    return sandbox_stage + clean_up_stage