Example #1
0
    def setUpClass(cls):
        """
        Prepare the class-level fixtures used by the tests in this class.

        Prints a banner containing the class name, asks the parent class to
        initialize its class variables, then stores on ``cls`` the project
        id, dataset id, sandbox id, the ``CleanHeightAndWeight`` rule
        instance, and the fully qualified table names before delegating to
        the parent ``setUpClass``.
        """
        banner = '**************************************************************'
        print(banner)
        print(cls.__name__)
        print(banner)

        super().initialize_class_vars()

        # Project identifier is read from the environment variable whose
        # name is held in PROJECT_ID
        project_id = os.environ.get(PROJECT_ID)
        cls.project_id = project_id

        # Dataset under test, and the sandbox dataset derived from its name
        dataset_id = bq_utils.get_combined_deid_dataset_id()
        cls.dataset_id = dataset_id
        sandbox_id = dataset_id + '_sandbox'
        cls.sandbox_id = sandbox_id

        # Rule under test
        rule_instance = CleanHeightAndWeight(project_id, dataset_id,
                                             sandbox_id)
        cls.rule_instance = rule_instance

        # Fully qualified names of the sandbox tables reported by the rule
        for sandbox_table in rule_instance.get_sandbox_tablenames():
            fq_sandbox_table = f'{project_id}.{sandbox_id}.{sandbox_table}'
            cls.fq_sandbox_table_names.append(fq_sandbox_table)

        # Fully qualified names of the dataset tables listed for this test
        dataset_tables = (
            'person',
            'measurement',
            'measurement_ext',
            'condition_occurrence',
            'concept',
            'concept_ancestor',
        )
        cls.fq_table_names = [
            f'{project_id}.{dataset_id}.{table}' for table in dataset_tables
        ]

        super().setUpClass()
Example #2
0
def clean_combined_de_identified_dataset(project_id=None, dataset_id=None):
    """
    Clean the combined de-identified dataset.

    When an identifier is ``None`` a default is looked up and logged.  A
    sandbox dataset is then created, the cleaning queries are gathered, and
    the queries are handed to the clean engine for the DEID_BASE stage.

    :param project_id:  Name of the BigQuery project.  Defaults to the
        application id when ``None``.
    :param dataset_id:  Name of the dataset to clean.  Defaults to the
        combined de-identified dataset id when ``None``.
    """
    # Fall back to the application id when no project was supplied
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info(
            'Project is unspecified.  Using default value of:\t%s',
            project_id,
        )

    # Fall back to the configured combined de-identified dataset
    if dataset_id is None:
        dataset_id = bq_utils.get_combined_deid_dataset_id()
        LOGGER.info(
            'Dataset is unspecified.  Using default value of:\t%s',
            dataset_id,
        )

    # The sandbox dataset id is passed on to the query gathering step
    sandbox_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                dataset_id=dataset_id)

    cleaning_queries = _gather_combined_de_identified_queries(
        project_id, dataset_id, sandbox_id)

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, cleaning_queries, stage.DEID_BASE)
Example #3
0
def clean_ehr_rdr_de_identified_dataset(project=None, dataset=None):
    """
    Gather the de-identified EHR/RDR cleaning queries and run them.

    :param project:  Project identifier, forwarded unchanged to the query
        gathering step and to the clean engine.
    :param dataset:  Dataset identifier.  When it is ``None``, empty, or
        only whitespace, the combined de-identified dataset id is used
        instead and the substitution is logged.
    """
    dataset_unspecified = (dataset is None or dataset == '' or
                           dataset.isspace())
    if dataset_unspecified:
        dataset = bq_utils.get_combined_deid_dataset_id()
        LOGGER.info(
            'Dataset is unspecified.  Using default value of:\t%s',
            dataset,
        )

    cleaning_queries = _gather_ehr_rdr_de_identified_queries(project, dataset)

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project, dataset, cleaning_queries)
Example #4
0
def get_dataset_and_project_names():
    """
    Collect the dataset identifiers and the project identifier.

    Each value is obtained from the corresponding ``bq_utils`` or
    ``app_identity`` getter and stored under its ``clean_cdr_consts`` key.

    :return: A dictionary mapping the ``clean_cdr_consts`` keys to the
        dataset names and the project name
    """
    return {
        clean_cdr_consts.EHR_DATASET:
            bq_utils.get_dataset_id(),
        clean_cdr_consts.UNIONED_EHR_DATASET:
            bq_utils.get_unioned_dataset_id(),
        clean_cdr_consts.RDR_DATASET:
            bq_utils.get_rdr_dataset_id(),
        clean_cdr_consts.EHR_RDR_DATASET:
            bq_utils.get_ehr_rdr_dataset_id(),
        clean_cdr_consts.EHR_RDR_DE_IDENTIFIED:
            bq_utils.get_combined_deid_dataset_id(),
        clean_cdr_consts.PROJECT:
            app_identity.get_application_id(),
    }
Example #5
0
def clean_ehr_rdr_de_identified_dataset(project_id=None, dataset_id=None):
    """
    Gather the cleaning queries for the de-identified EHR and RDR dataset
    and pass them to the clean engine.

    :param project_id:  Name of the BigQuery project.  Forwarded as given.
    :param dataset_id:  Name of the dataset to clean.  A value that is
        ``None``, empty, or whitespace-only is replaced by the combined
        de-identified dataset id, and the replacement is logged.
    """
    needs_default = dataset_id is None
    if not needs_default:
        # Blank strings are treated the same as a missing dataset id
        needs_default = dataset_id == '' or dataset_id.isspace()

    if needs_default:
        dataset_id = bq_utils.get_combined_deid_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    cleaning_queries = _gather_ehr_rdr_de_identified_queries(
        project_id, dataset_id)

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, dataset_id, cleaning_queries)