Example #1
    def test_fields_for(self):
        """
        Testing that fields_for works as expected with sub-directory structures.
        """
        # preconditions

        # test
        actual_fields = resources.fields_for('person')

        # post conditions
        person_path = os.path.join(resources.base_path, 'resource_files',
                                   'schemas', 'cdm', 'clinical', 'person.json')
        with open(person_path, 'r') as fp:
            expected_fields = json.load(fp)

        self.assertEqual(actual_fields, expected_fields)

        # test
        actual_fields = resources.fields_for('person_ext')
        person_ext_path = os.path.join(resources.base_path, 'resource_files',
                                       'schemas', 'extension_tables',
                                       'person_ext.json')
        with open(person_ext_path, 'r') as fp:
            expected_fields = json.load(fp)

        self.assertEqual(actual_fields, expected_fields)
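Each schema file that fields_for reads is a JSON array of field definitions; for reference, a single entry has roughly the shape below (illustrative only, not the exact contents of person.json).

# Illustrative schema entry; fields_for returns the parsed JSON, i.e. a list of
# dicts with these keys (see the name/type/mode/description reads in later examples).
example_field = {
    'name': 'person_id',
    'type': 'integer',
    'mode': 'required',
    'description': 'A unique identifier for each person.'
}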
Example #2
    def test_get_person_id_validation_queries_deid(self):
        # pre conditions

        # test
        results = validator.get_person_id_validation_queries('foo', 'bar_deid')

        # post conditions
        self.assertEqual(len(results), ((len(self.all_tables) * 2) - 1))

        existing_and_consenting = validator.EXISTING_AND_VALID_CONSENTING_RECORDS
        existing_in_person_table = validator.SELECT_EXISTING_PERSON_IDS

        expected = []
        for table in self.mapped_tables:
            field_names = [
                'entry.' + field['name']
                for field in resources.fields_for(table)
            ]
            fields = ', '.join(field_names)

            expected.append({
                clean_consts.QUERY:
                existing_and_consenting.format(project='foo',
                                               dataset='bar_deid',
                                               mapping_dataset='bar',
                                               table=table,
                                               fields=fields),
                clean_consts.DESTINATION_TABLE:
                table,
                clean_consts.DESTINATION_DATASET:
                'bar_deid',
                clean_consts.DISPOSITION:
                bq_consts.WRITE_TRUNCATE,
            })

        for table in self.all_tables:
            field_names = [
                'entry.' + field['name']
                for field in resources.fields_for(table)
            ]
            fields = ', '.join(field_names)

            expected.append({
                clean_consts.QUERY:
                existing_in_person_table.format(project='foo',
                                                dataset='bar_deid',
                                                table=table,
                                                fields=fields),
                clean_consts.DESTINATION_TABLE:
                table,
                clean_consts.DESTINATION_DATASET:
                'bar_deid',
                clean_consts.DISPOSITION:
                bq_consts.WRITE_TRUNCATE,
            })

        self.assertEqual(expected, results)
Example #3
    def _mapping_table_checks(self):
        """
        Check mapping tables exist, have correct schema, have expected number of records
        """
        where = (
            'WHERE EXISTS '
            '  (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
            '   WHERE t.person_id = c.person_id)').format(
                combined_dataset_id=self.combined_dataset_id,
                ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
        ehr_counts = test_util.get_table_counts(self.ehr_dataset_id,
                                                DOMAIN_TABLES, where)
        rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
        combined_counts = test_util.get_table_counts(self.combined_dataset_id)
        output_tables = combined_counts.keys()
        expected_counts = dict()
        expected_diffs = ['observation']

        for table in DOMAIN_TABLES:
            expected_mapping_table = mapping_table_for(table)
            self.assertIn(expected_mapping_table, output_tables)
            expected_fields = resources.fields_for(expected_mapping_table)
            actual_table_info = bq_utils.get_table_info(
                expected_mapping_table, self.combined_dataset_id)
            actual_fields = actual_table_info.get('schema',
                                                  dict()).get('fields', [])
            actual_fields_norm = map(test_util.normalize_field_payload,
                                     actual_fields)
            self.assertCountEqual(expected_fields, actual_fields_norm)

            # Count should be sum of EHR and RDR
            # (except for tables like observation where extra records are created for demographics)
            if 'person_id' in [
                    field.get('name', '')
                    for field in resources.fields_for(table)
            ]:
                unconsented_ehr_records = self.get_unconsented_ehr_records_count(
                    table)
            else:
                unconsented_ehr_records = 0

            actual_count = combined_counts[expected_mapping_table]

            if table in expected_diffs:
                expected_count = actual_count
            else:
                expected_count = (ehr_counts[table] -
                                  unconsented_ehr_records) + rdr_counts[table]
            expected_counts[expected_mapping_table] = expected_count

        self.assertDictContainsSubset(expected_counts, combined_counts)
Example #4
    def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
        queries_list = []

        for table in self.affected_tables:
            fields = [field['name'] for field in resources.fields_for(table)]
            fields_to_replace = get_fields_dict(table, fields)

            if fields_to_replace:
                cols = get_modified_columns(fields, fields_to_replace)

                full_join_expression = get_full_join_expression(
                    self.dataset_id, self.project_id, fields_to_replace)

                query = dict()
                query[cdr_consts.QUERY] = FIELD_REPLACE_QUERY.render(
                    columns=cols,
                    table_name=table,
                    dataset=self.dataset_id,
                    project=self.project_id,
                    join_expression=full_join_expression)
                query[cdr_consts.DESTINATION_TABLE] = table
                query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
                query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
                queries_list.append(query)
        return queries_list
Example #5
def parse_src_concept_id_update_query(project_id, dataset_id, table_name):
    """
    Fill in template query used to generate updated domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of a domain table
    :return: parsed src_concept_id_update query
    """
    fields = [field['name'] for field in resources.fields_for(table_name)]
    col_exprs = []
    fields_to_replace = {
        resources.get_domain_id_field(table_name): 'dest_id',
        resources.get_domain_concept_id(table_name): 'new_concept_id',
        resources.get_domain_source_concept_id(table_name): 'new_src_concept_id'
    }
    for field_name in fields:
        if field_name in fields_to_replace:
            col_expr = 'coalesce({replace_field}, {field}) AS {field}'.format(
                replace_field=fields_to_replace[field_name],
                field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)

    return SRC_CONCEPT_ID_UPDATE_QUERY.format(cols=cols,
                                              project=project_id,
                                              dataset=dataset_id,
                                              domain_table=table_name,
                                              logging_table=SRC_CONCEPT_ID_TABLE_NAME)
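A rough sketch of the column list this builds, assuming the domain table is condition_occurrence and the three get_domain_* helpers resolve to condition_occurrence_id, condition_concept_id, and condition_source_concept_id (illustrative only; field order follows the table's schema file).

# Hypothetical value of `cols` for condition_occurrence
cols = ('coalesce(dest_id, condition_occurrence_id) AS condition_occurrence_id, '
        'person_id, '
        'coalesce(new_concept_id, condition_concept_id) AS condition_concept_id, '
        '..., '
        'coalesce(new_src_concept_id, condition_source_concept_id) AS condition_source_concept_id')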
Example #6
def parse_mapping_table_update_query(project_id, dataset_id, table_name, mapping_table_name):
    """

    Fill in mapping tables query so it either gets dest_id from the logging table or the domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of the domain table for which the query needs to be parsed
    :param mapping_table_name: name of the mapping_table for which the query needs to be parsed
    :return: parsed update mapping tables query
    """
    fields = [field['name'] for field in resources.fields_for(mapping_table_name)]
    col_exprs = []
    for field_name in fields:
        if field_name == resources.get_domain_id_field(table_name):
            col_expr = 'coalesce(dest_id, {field}) AS {field}'.format(field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)
    return UPDATE_MAPPING_TABLES_QUERY.format(cols=cols,
                                              project=project_id,
                                              dataset=dataset_id,
                                              mapping_table=mapping_table_name,
                                              logging_table=SRC_CONCEPT_ID_TABLE_NAME,
                                              domain_table=table_name)
Example #7
def get_queries(project=None, dataset=None):
    """
    Return a list of queries to remove data for missing persons.

    Removes data from person_id linked tables for any persons which do not
    exist in the person table.

    :return:  A list of string queries that can be executed to delete data from
        other tables for non-person users.
    """
    query_list = []
    for table in common.CLINICAL_DATA_TABLES:
        field_names = [
            'entry.' + field['name'] for field in resources.fields_for(table)
        ]
        fields = ', '.join(field_names)

        delete_query = SELECT_EXISTING_PERSON_IDS.format(project=project,
                                                         dataset=dataset,
                                                         table=table,
                                                         fields=fields)

        query_list.append({
            clean_consts.QUERY: delete_query,
            clean_consts.DESTINATION_TABLE: table,
            clean_consts.DESTINATION_DATASET: dataset,
            clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
        })

    return query_list
Example #8
def mapping_query(domain_table):
    """
    Returns query used to get mapping of all records from RDR combined with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return:
    """

    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)
Example #9
def get_table_fields(table, ext_table_id):
    """
    Generates fields for ext tables for the provided cdm table

    :param table: cdm table to generate ext fields for
    :param ext_table_id: cdm table extension name, used to load the schema
        definition if it exists as a json file
    :return: list of ext field definitions for the cdm table
    """
    table_fields = []

    try:
        table_fields = fields_for(ext_table_id)
        LOGGER.info(
            f"using json schema file definition for table: {ext_table_id}")
    except RuntimeError:
        for field in EXT_FIELD_TEMPLATE:
            table_field = dict()
            for key in field:
                table_field[key] = field[key].format(table=table)
            table_fields.append(table_field)
        LOGGER.info(
            f"using dynamic extension table schema for table: {ext_table_id}")

    return table_fields
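When no JSON schema file exists, the fallback loop formats every value in EXT_FIELD_TEMPLATE with the cdm table name. The template itself is not shown in this example; a hypothetical shape would be:

# Hypothetical EXT_FIELD_TEMPLATE entry (assumption, not the real definition);
# e.g. '{table}_id' formats to 'observation_id' for table='observation'.
EXT_FIELD_TEMPLATE = [{
    'name': '{table}_id',
    'type': 'integer',
    'mode': 'nullable',
    'description': 'The {table}_id of the record in the {table} table.'
}]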
Example #10
    def create_filter_list(self, filter_list):
        """
        Create a list of filter expressions.

        :param filter_list: a list of dictionaries representing the filter
            expression to be used as part of an SQL WHERE clause

        :return: a list of strings representing the expressions.
        """
        field_definitions = {}
        for field_def in fields_for(self.tablename):
            field_definitions[field_def.get('name')] = field_def

        string_list = []
        for item in filter_list:
            string, field = create_on_string(item)

            field_definition = field_definitions.get(field)
            field_mode = field_definition.get('mode').lower()

            # if based on a nullable field, make sure to use the exists function
            if field_mode.lower() == 'nullable':
                item['qualifier'] = item['qualifier'].upper()
                string, _ = create_on_string(item)
                nullable_str = (
                    ' exists (SELECT * FROM `:idataset.observation` AS record2 '
                    'WHERE :join_tablename.observation_id = record2.observation_id '
                    'AND {conditional})')
                string = nullable_str.format(conditional=string)

            string_list.append(string)

        return string_list
Example #11
def get_table_schema(table_name, fields=None):
    """
    A helper function to create BigQuery SchemaFields from dictionary definitions.

    Given the table name, reads the schema from the schema definition file
    and returns a list of SchemaField objects that can be used for table
    creation.

    :param table_name:  the table name to get BigQuery SchemaField information
        for.
    :param fields: An optional argument to provide fields/schema as a list of JSON objects
    :returns:  a list of SchemaField objects representing the table's schema.
    """
    if not fields:
        fields = fields_for(table_name)

    schema = []
    for column in fields:
        name = column.get('name')
        field_type = column.get('type')
        mode = column.get('mode')
        description = column.get('description')
        column_def = bigquery.SchemaField(name, field_type, mode, description)

        schema.append(column_def)

    return schema
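A minimal usage sketch, assuming a configured google-cloud-bigquery client; the project, dataset, and table names are placeholders.

from google.cloud import bigquery

# Build the person table's schema from its JSON definition and create the table.
client = bigquery.Client()
schema = get_table_schema('person')
table = bigquery.Table('my-project.my_dataset.person', schema=schema)
table = client.create_table(table)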
Example #12
    def test_fields_for_duplicate_files(self, mock_walk):
        """
        Testing that fields_for works as expected with sub-directory structures.

        Verifies that if duplicates are detected and no distinction is made as
        to which one is wanted, an error is raised.  Also shows that if duplicate
        file names exist in separate directories, if the named sub-directory is
        searched and the file is found, this file is opened and read.
        """
        # preconditions
        sub_dir = 'baz'
        # mocks result tuples for os.walk
        walk_results = [(os.path.join('resource_files', 'schemas'), [sub_dir],
                         ['duplicate.json', 'unique1.json']),
                        (os.path.join('resource_files', 'schemas', sub_dir),
                         [], ['duplicate.json', 'unique2.json'])]

        mock_walk.return_value = walk_results

        # test
        self.assertRaises(RuntimeError, resources.fields_for, 'duplicate')

        # test
        data = '[{"id": "fake id desc", "type": "fake type"}]'
        json_data = json.loads(data)
        with mock.patch('resources.open',
                        mock.mock_open(read_data=data)) as mock_file:
            with mock.patch('resources.json.load') as mock_json:
                mock_json.return_value = json_data
                actual_fields = resources.fields_for('duplicate', sub_dir)
                self.assertEqual(actual_fields, json_data)
Example #13
    def test_validate_five_persons_success(self, mock_check_cron):
        expected_results = []
        test_file_names = [
            os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
        ]

        for cdm_file in common.SUBMISSION_FILES:
            if cdm_file in test_file_names:
                expected_result = (cdm_file, 1, 1, 1)
                test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
                test_util.write_cloud_file(self.hpo_bucket,
                                           test_file,
                                           prefix=self.folder_prefix)
            else:
                expected_result = (cdm_file, 0, 0, 0)
            expected_results.append(expected_result)
        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket,
                                     folder_items, self.folder_prefix)
        self.assertSetEqual(set(r['results']), set(expected_results))

        # check tables exist and are clustered as expected
        for table in resources.CDM_TABLES + common.PII_TABLES:
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            fields = resources.fields_for(table)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)
Example #14
def store_participant_data(df, project_id, destination_table):
    """
    Stores the fetched participant data in a BigQuery dataset. If the
    table doesn't exist, it will create that table. If the table does
    exist, it will append the data onto that designated table.

    :param df: pandas dataframe created to hold participant data fetched from ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of dataset.tablename

    :return: returns a dataset with the participant data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the tables')

    table_schema = fields_for(destination_table.split('.')[-1])

    return pandas_gbq.to_gbq(df,
                             destination_table,
                             project_id,
                             if_exists="replace",
                             table_schema=table_schema)
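A minimal, hypothetical call; the dataframe columns and the destination table name are placeholders and would need to match the table's JSON schema file.

import pandas

df = pandas.DataFrame({'person_id': [1, 2], 'consent_status': ['SUBMITTED', 'SUBMITTED']})
store_participant_data(df, 'my-project', 'my_dataset.participant_data')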
Example #15
def parse_cdr_metadata_args(args=None):
    fields = resources.fields_for(METADATA_TABLE)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '--component',
        required=True,
        help='Job specification for adding data to metadata table',
        choices=list(JOB_COMPONENTS))
    parser.add_argument('--project_id',
                        required=True,
                        help='Identifies the project containing the datasets')
    parser.add_argument('--target_dataset',
                        default=True,
                        help='Identifies the dataset to copy metadata to')
    parser.add_argument('--source_dataset',
                        default=None,
                        help='Identifies the dataset to copy metadata from')

    for field in fields:
        parser.add_argument(f'--{field[NAME]}',
                            default=None,
                            help=f'{field[DESCRIPTION]}')

    cdr_metadata_args, unknown_args = parser.parse_known_args(args)
    custom_args = clean_cdr._get_kwargs(unknown_args)
    return cdr_metadata_args, custom_args
Example #16
def get_table_cols_df(client, project_id, dataset_id):
    """
    Returns a df of dataset's INFORMATION_SCHEMA.COLUMNS

    :param project_id: bq name of project_id
    :param dataset_id: bq name of dataset_id
    :param client: bq client object
    :return: dataframe of columns from INFORMATION_SCHEMA
    """
    table_cols_df = pd.DataFrame()
    if client:
        LOGGER.info(
            f"Getting column information from live dataset: `{dataset_id}`")
        # if possible, read live table schemas
        table_cols_query = TABLE_INFORMATION_SCHEMA.render(project=project_id,
                                                           dataset=dataset_id)
        table_cols_df = client.query(table_cols_query).to_dataframe()
    else:
        # if None is passed to the client, read the table data from JSON schemas
        # generate a dataframe from schema files
        LOGGER.info("Getting column information from schema files")
        table_dict_list = []
        for table in FITBIT_TABLES + CDM_TABLES:
            table_fields = resources.fields_for(table)
            for field in table_fields:
                field['table_name'] = table
            table_dict_list.extend(table_fields)

        table_cols_df = pd.DataFrame(table_dict_list)
        table_cols_df = table_cols_df.rename(columns={"name": "column_name"})

    return table_cols_df
Example #17
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        table_name = table
        fields = resources.fields_for(table)
        # Generate column expressions for select
        col_exprs = [field['name'] for field in fields]
        cols = ', '.join(col_exprs)
        query = dict()
        query[cdr_consts.QUERY] = ID_DE_DUP_QUERY.format(columns=cols,
                                                         project_id=project_id,
                                                         dataset_id=dataset_id,
                                                         domain_table=table,
                                                         table_name=table_name)

        query[cdr_consts.DESTINATION_TABLE] = table
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        queries.append(query)
    return queries
Example #18
def get_fill_freetext_source_value_fields_queries(project_id, dataset_id):
    """

    Generates queries to replace the source_value_fields with the concept_code.

    :param project_id: Name of the project containing the dataset on which the rules are to be applied
    :param dataset_id: Name of the dataset on which the rules are to be applied
    :return: A list of queries to be run.
    """
    queries_list = []
    for table in resources.CDM_TABLES:
        fields = [field['name'] for field in resources.fields_for(table)]
        fields_to_replace = get_fields_dict(table, fields)

        if fields_to_replace:
            cols = get_modified_columns(fields, fields_to_replace)

            full_join_expression = get_full_join_expression(
                dataset_id, project_id, fields_to_replace)

            query = dict()
            query[cdr_consts.QUERY] = FIELD_REPLACE_QUERY.format(
                columns=cols,
                table_name=table,
                dataset=dataset_id,
                project=project_id,
                join_expression=full_join_expression)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = dataset_id
            queries_list.append(query)
    return queries_list
Example #19
    def _load_data(self):

        # Load measurement_concept_sets
        required_labs.load_measurement_concept_sets_table(
            project_id=self.project_id, dataset_id=self.dataset_id)
        # Load measurement_concept_sets_descendants
        required_labs.load_measurement_concept_sets_descendants_table(
            project_id=self.project_id, dataset_id=self.dataset_id)

        # Create concept and concept_ancestor empty tables if not exist
        if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)

        # we need to load measurement.csv into bigquery_dataset_id in advance for the other integration tests
        ehr_measurement_result = bq_utils.load_table_from_csv(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
            csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
            fields=resources.fields_for(common.MEASUREMENT))
        bq_utils.wait_on_jobs(
            [ehr_measurement_result['jobReference']['jobId']])
Example #20
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        if 'unioned' in dataset_id:
            table_name = 'unioned_ehr_{table}'.format(table=table)
        else:
            table_name = table
        if bq_utils.table_exists(table_name, dataset_id):
            fields = resources.fields_for(table)
            # Generate column expressions for select
            col_exprs = [field['name'] for field in fields]
            cols = ',\n        '.join(col_exprs)
            query = ID_DE_DUP_QUERY.format(columns=cols,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           domain_table=table,
                                           table_name=table_name)
            queries.append(query)
    return queries
Example #21
    def test_measurement_concept_sets_table(self):

        query = sql_wrangle.qualify_tables(
            '''SELECT * FROM {dataset_id}.{table_id}'''.format(
                dataset_id=self.dataset_id,
                table_id=MEASUREMENT_CONCEPT_SETS_TABLE))
        response = bq_utils.query(query)

        actual_fields = [{
            'name': field['name'].lower(),
            'type': field['type'].lower()
        } for field in response['schema']['fields']]

        expected_fields = [{
            'name': field['name'].lower(),
            'type': field['type'].lower()
        } for field in resources.fields_for(MEASUREMENT_CONCEPT_SETS_TABLE)]

        self.assertListEqual(expected_fields, actual_fields)

        measurement_concept_sets_table_path = os.path.join(
            resources.resource_path, MEASUREMENT_CONCEPT_SETS_TABLE + '.csv')
        expected_total_rows = len(
            resources.csv_to_list(measurement_concept_sets_table_path))
        self.assertEqual(expected_total_rows, int(response['totalRows']))
Example #22
def get_upgrade_table_query(client, dataset_id, table_id, hpo_id=None):
    """
    Generate query for specified tables

    :param client: BQ Client
    :param dataset_id: Source dataset
    :param table_id: Source table
    :param hpo_id: identifies the HPO site; used to resolve the base table name
    :return: select query string casting source columns to the destination schema
    """
    try:
        source_table = f'{client.project}.{dataset_id}.{table_id}'
        source_fields = sq.get_source_fields(client, source_table)
        dst_fields = resources.fields_for(
            resources.get_base_table_name(table_id, hpo_id))
        col_cast_exprs = [
            get_field_cast_expr_with_schema_change(field, source_fields)
            for field in dst_fields
        ]
        col_expr = ', '.join(col_cast_exprs)
    except (OSError, IOError, RuntimeError):
        # default to select *
        col_expr = '*'
    select_all_query = 'SELECT {col_expr} FROM `{project_id}.{dataset_id}.{table_id}`'
    return select_all_query.format(col_expr=col_expr,
                                   project_id=client.project,
                                   dataset_id=dataset_id,
                                   table_id=table_id)
Example #23
def get_non_match_participant_query(project_id, validation_dataset_id,
                                    identity_match_table):
    """
    This function generates the query for identifying non_match participants query flagged by the DRC match algorithm
    
    :param project_id: 
    :param validation_dataset_id: 
    :param identity_match_table: 
    :return: 
    """

    # if at least NUM_OF_MISSING_KEY_FIELDS of first_name, last_name, and birthday are missing, this is a non-match
    num_of_missing_key_fields = CRITERION_COLUMN_TEMPLATE.format(
        column_expr=get_missing_criterion(KEY_FIELDS),
        num_of_missing=NUM_OF_MISSING_KEY_FIELDS)

    identity_match_fields = [
        field['name'] for field in resources.fields_for(IDENTITY_MATCH)
        if field['name'] not in IDENTITY_MATCH_EXCLUDED_FIELD
    ]
    # if the total number of missing fields is greater than or equal to NUM_OF_MISSING_ALL_FIELDS (4), this is a non-match
    num_of_missing_all_fields = CRITERION_COLUMN_TEMPLATE.format(
        column_expr=get_missing_criterion(identity_match_fields),
        num_of_missing=NUM_OF_MISSING_ALL_FIELDS)
    # instantiate the query for identifying the non-match participants in the validation_dataset
    select_non_match_participants_query = SELECT_NON_MATCH_PARTICIPANTS_QUERY.format(
        project_id=project_id,
        validation_dataset_id=validation_dataset_id,
        identity_match_table=identity_match_table,
        key_fields_criteria=num_of_missing_key_fields,
        all_fields_criteria=num_of_missing_all_fields)

    return select_non_match_participants_query
Example #24
def null_invalid_foreign_keys(project_id, dataset_id, sandbox_dataset_id=None):
    """
    This method gets the queries required to make invalid foreign keys null

    :param project_id: Project associated with the input and output datasets
    :param dataset_id: Dataset where cleaning rules are to be applied
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows 
    #TODO use sandbox_dataset_id for CR
    :return: a list of queries
    """
    queries_list = []
    for table in resources.CDM_TABLES:
        field_names = [field['name'] for field in resources.fields_for(table)]
        foreign_keys_flags = []
        fields_to_join = []

        for field_name in field_names:
            if field_name in FOREIGN_KEYS_FIELDS and field_name != table + '_id':
                fields_to_join.append(field_name)
                foreign_keys_flags.append(field_name)

        if fields_to_join:
            col_exprs = []
            for field in field_names:
                if field in fields_to_join:
                    if field in foreign_keys_flags:
                        col_expr = '{x}.'.format(x=field[:3]) + field
                else:
                    col_expr = field
                col_exprs.append(col_expr)
            cols = ', '.join(col_exprs)

            join_expression = []
            for key in FOREIGN_KEYS_FIELDS:
                if key in foreign_keys_flags:
                    if key == 'person_id':
                        table_alias = cdr_consts.PERSON_TABLE_NAME
                    else:
                        table_alias = _mapping_table_for(
                            '{x}'.format(x=key)[:-3])
                    join_expression.append(
                        LEFT_JOIN.format(dataset_id=dataset_id,
                                         prefix=key[:3],
                                         field=key,
                                         table=table_alias))

            full_join_expression = " ".join(join_expression)
            query = dict()
            query[cdr_consts.QUERY] = INVALID_FOREIGN_KEY_QUERY.format(
                cols=cols,
                table_name=table,
                dataset_id=dataset_id,
                project=project_id,
                join_expr=full_join_expression)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = dataset_id
            queries_list.append(query)
    return queries_list
Example #25
    def test_get_query_spec(self):
        # Pre conditions
        self.assertEqual(self.rule_instance.affected_datasets,
                         [cdr_consts.COMBINED])

        # Test
        result_list = self.rule_instance.get_query_specs()

        # Post conditions
        expected_query_list = []
        expected_sandbox_query_list = []
        for table in table_dates:
            sandbox_query = dict()
            sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
                project_id=self.project_id,
                sandbox_id=self.sandbox_dataset_id,
                intermediary_table=self.rule_instance.sandbox_table_for(table),
                dataset_id=self.dataset_id,
                table=table,
                table_end_date=table_dates[table][1],
                table_start_date=table_dates[table][0])
            expected_sandbox_query_list.append(sandbox_query)

            fields = resources.fields_for(table)
            col_exprs = [
                'r.' + field['name'] if field['name'] == table_dates[table][1]
                else 'l.' + field['name'] for field in fields
            ]
            cols = ', '.join(col_exprs)
            query = dict()
            query[cdr_consts.QUERY] = NULL_BAD_END_DATES.render(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                cols=cols,
                table=table,
                table_start_date=table_dates[table][0],
                table_end_date=table_dates[table][1])
            expected_query_list.append(query)
        sandbox_query = dict()
        sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
            project_id=self.project_id,
            sandbox_id=self.sandbox_dataset_id,
            intermediary_table=self.rule_instance.sandbox_table_for(
                visit_occurrence),
            dataset_id=self.dataset_id,
            table=visit_occurrence,
            table_end_date=visit_occurrence_dates[visit_occurrence][1],
            table_start_date=visit_occurrence_dates[visit_occurrence][0])
        expected_sandbox_query_list.append(sandbox_query)
        query = dict()
        query[cdr_consts.QUERY] = POPULATE_VISIT_END_DATES.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            placeholder_date=placeholder_date)
        expected_query_list.append(query)

        expected_list = expected_sandbox_query_list + expected_query_list

        self.assertEqual(result_list, expected_list)
Example #26
def main(raw_args=None):
    """
    Execute deid as a single script.

    Responsible for aggregating the tables deid will execute on and calling deid.
    """
    args = parse_args(raw_args)
    add_console_logging(args.console_log)
    known_tables = get_known_tables(fields_path)
    deid_tables_path = os.path.join(DEID_PATH, 'config', 'ids', 'tables')
    configured_tables = get_known_tables(deid_tables_path)
    tables = get_output_tables(args.input_dataset, known_tables,
                               args.skip_tables, args.tables)

    exceptions = []
    successes = []
    for table in tables:
        tablepath = None
        if table in configured_tables:
            tablepath = os.path.join(deid_tables_path, table + '.json')
        else:
            tablepath = table

        parameter_list = [
            '--rules',
            os.path.join(DEID_PATH, 'config', 'ids',
                         'config.json'), '--private_key', args.private_key,
            '--table', tablepath, '--action', args.action, '--idataset',
            args.input_dataset, '--log', LOGS_PATH
        ]

        if args.interactive_mode:
            parameter_list.append('--interactive')

        field_names = [field.get('name') for field in fields_for(table)]
        if 'person_id' in field_names:
            parameter_list.append('--cluster')

        LOGGER.info('Executing deid with:\n\tpython deid/aou.py %s',
                    ' '.join(parameter_list))

        try:
            aou.main(parameter_list)
        except google.api_core.exceptions.GoogleAPIError:
            LOGGER.exception("Encountered deid exception:\n")
            exceptions.append(table)
        else:
            LOGGER.info('Successfully executed deid on table: %s', table)
            successes.append(table)

    copy_suppressed_table_schemas(known_tables, args.input_dataset + '_deid')

    LOGGER.info('Deid has finished.  Successfully executed on tables:  %s',
                '\n'.join(successes))
    for exc in exceptions:
        LOGGER.error(
            "Deid encountered exceptions when processing table: %s"
            ".  Fix problems and re-run deid for table if needed.", exc)
Example #27
    def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
        """
        Return a list of dictionary query specifications.

        :return:  A list of dictionaries. Each dictionary contains a single query
            and a specification for how to execute that query. The specifications
            are optional but the query is required.
        """
        queries = []
        sandbox_queries = []

        for table in table_dates:
            # create sandbox queries
            sandbox_query = dict()
            sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
                project_id=self.project_id,
                sandbox_id=self.sandbox_dataset_id,
                intermediary_table=self.sandbox_table_for(table),
                dataset_id=self.dataset_id,
                table=table,
                table_end_date=table_dates[table][1],
                table_start_date=table_dates[table][0])
            sandbox_queries.append(sandbox_query)

            fields = resources.fields_for(table)
            # Generate column expressions for select
            col_exprs = [
                'r.' + field['name'] if field['name'] == table_dates[table][1]
                else 'l.' + field['name'] for field in fields
            ]
            cols = ', '.join(col_exprs)
            query = dict()
            query[cdr_consts.QUERY] = NULL_BAD_END_DATES.render(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                cols=cols,
                table=table,
                table_start_date=table_dates[table][0],
                table_end_date=table_dates[table][1])
            queries.append(query)
        # Sandbox query for visit_occurrence
        sandbox_vo_query = dict()
        sandbox_vo_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
            project_id=self.project_id,
            sandbox_id=self.sandbox_dataset_id,
            intermediary_table=self.sandbox_table_for(visit_occurrence),
            dataset_id=self.dataset_id,
            table=visit_occurrence,
            table_end_date=visit_occurrence_dates[visit_occurrence][1],
            table_start_date=visit_occurrence_dates[visit_occurrence][0])
        sandbox_queries.append(sandbox_vo_query)
        vo_query = dict()
        vo_query[cdr_consts.QUERY] = POPULATE_VISIT_END_DATES.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            placeholder_date=placeholder_date)
        queries.append(vo_query)
        return sandbox_queries + queries
Example #28
def _get_field_names(tablename):
    try:
        fields = fields_for(tablename)
    except RuntimeError:
        additional_fields = []
    else:
        additional_fields = [f.get('name') for f in fields]

    return additional_fields
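A small usage sketch: because the RuntimeError from fields_for is swallowed, the helper degrades to an empty list when no schema file is found (assuming fields_for raises RuntimeError in that case).

# Field names for a table with a schema file, e.g. ['person_id', 'gender_concept_id', ...]
names = _get_field_names('person')
# Empty list for a table without a schema definition
names = _get_field_names('no_such_table')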
Example #29
def get_table_count_query(dataset_id, table_ids, where):
    queries = []
    for table_id in table_ids:
        if table_id == '_ehr_consent' or 'person_id' in [
                field['name'] for field in resources.fields_for(table_id)
        ]:
            queries.append(table_count_query(dataset_id, table_id, where))
        else:
            queries.append(table_count_query(dataset_id, table_id, where=''))
    return queries
Example #30
    def get_field_names(self, table):
        """
        This method gets the list of field names in a table affected by the cleaning rule

        :param table: single table in the list of affected tables
        :return: list of field names in a single affected table
        """
        field_names = [field['name'] for field in resources.fields_for(table)]
        return field_names