    def setUp(self):
        self.project_id = bq_utils.app_identity.get_application_id()
        self.dataset_id = bq_utils.get_combined_dataset_id()
        self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
        if not self.project_id or not self.dataset_id:
            # TODO: Fix handling of globals, push these assertions down if they are required.
            raise ValueError(
                f"missing configuration for project ('{self.project_id}') " +
                f"and/or dataset ('{self.dataset_id}')")

        # TODO: Reconcile this with a consistent integration testing model. Ideally each test should
        # clean up after itself so that we don't need this defensive check.
        test_util.delete_all_tables(self.dataset_id)

        # drop concept table
        drop_concept_table(self.dataset_id)

        create_tables = ['person', 'observation']
        table_fields = {
            'person': 'post_deid_person',
            'observation': 'observation',
            'concept': 'concept'
        }
        for tbl in ['concept']:
            if not bq_utils.table_exists(tbl, dataset_id=self.dataset_id):
                create_tables.append(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(table_fields[tbl],
                                           tbl,
                                           dataset_id=self.dataset_id,
                                           force_all_nullable=True)
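
All of these fixtures resolve dataset IDs at runtime rather than hard-coding them. A minimal sketch of what a helper like bq_utils.get_unioned_dataset_id() typically wraps, assuming configuration via an environment variable (the variable name here is an assumption, not a confirmed API):

import os

def get_unioned_dataset_id():
    """Sketch only: resolve the unioned EHR dataset ID from the environment.

    The environment variable name below is an assumption; check your
    deployment's configuration for the actual source of this value.
    """
    return os.environ['UNIONED_DATASET_ID']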
Example #2
def remove_ehr_data_queries(project_id, ticket_number, pids_project_id,
                            pids_dataset_id, tablename):
    """
    Creates sandboxes and drops all EHR data found for deactivated participants after
    their deactivation date

    :param project_id: BQ name of the project
    :param ticket_number: Jira ticket number to identify and title sandbox tables
    :param pids_project_id: Identifies the BQ project containing the deactivated participants PIDs table
    :param pids_dataset_id: Identifies the BQ dataset containing the deactivated participants PIDs table
    :param tablename: The name of the table to house the deactivated participant data
    """

    ehr_union_dataset = bq_utils.get_unioned_dataset_id()

    # Get the deactivated participant data to ensure it is up-to-date
    df = psr.get_deactivated_participants(pids_project_id, pids_dataset_id,
                                          tablename,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)
    # Store the dataframe in a BQ dataset table
    destination_table = pids_dataset_id + '.' + tablename
    psr.store_participant_data(df, project_id, destination_table)
    # Create sandbox and truncate queries to run for the deactivated participant data drops
    queries = rdp.create_queries(
        project_id,
        ticket_number=ticket_number,
        # the deactivated participants table is stored in the same project
        # as the data being retracted
        pids_project_id=project_id,
        pids_dataset_id=pids_dataset_id,
        pids_table=tablename,
        datasets=[ehr_union_dataset])

    return queries
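
A hedged usage sketch for the function above. Every argument value below is a hypothetical placeholder, and the generated query dicts are printed for inspection rather than executed:

# All argument values are hypothetical placeholders.
queries = remove_ehr_data_queries(project_id='my-project',
                                  ticket_number='DC-1234',
                                  pids_project_id='my-project',
                                  pids_dataset_id='deactivated_pids',
                                  tablename='deactivated_participants')
for query in queries:
    # Inspect each sandbox/truncate query before running it.
    print(query)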
Example #3
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()
        self.testbed.init_memcache_stub()
        self.testbed.init_urlfetch_stub()
        self.testbed.init_blobstore_stub()
        self.testbed.init_datastore_v3_stub()
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
            eu_constants.LOCATION_ID
        ]
Example #4
def clean_unioned_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the unioned ehr dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified.  Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)

    query_list = _gather_unioned_ehr_queries(project_id, dataset_id,
                                             sandbox_dataset_id)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.UNIONED)
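
Because both parameters default to None and fall back to environment-derived values, the function can be invoked with no arguments in a configured environment. A minimal, hedged usage sketch (the explicit names below are hypothetical placeholders):

# Rely on the environment-derived defaults...
clean_unioned_ehr_dataset()

# ...or pin both values explicitly (hypothetical placeholders).
clean_unioned_ehr_dataset(project_id='my-project',
                          dataset_id='unioned_ehr20190801')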
Example #5
def delete_records_for_non_matching_participants(project_id,
                                                 dataset_id,
                                                 sandbox_dataset_id=None,
                                                 ehr_dataset_id=None,
                                                 validation_dataset_id=None):
    """
    Generates the queries that delete participants and their corresponding data
    points when the participant_match data is missing and the DRC matching
    algorithm flags the participant as a non-match.

    :param project_id: Identifies the BQ project
    :param dataset_id: Identifies the dataset from which records are dropped
    :param ehr_dataset_id: Identifies the EHR dataset containing the participant_match tables
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows
    # TODO use sandbox_dataset_id for CR
    :param validation_dataset_id: Identifies the dataset containing participant validation results

    :return: a list of queries
    """

    if ehr_dataset_id is None:
        ehr_dataset_id = bq_utils.get_unioned_dataset_id()

    if validation_dataset_id is None:
        validation_dataset_id = bq.get_latest_validation_dataset_id(project_id)

    non_matching_person_ids = []

    # Retrieving all hpo_ids
    for hpo_id in readers.get_hpo_site_names():
        if not exist_participant_match(ehr_dataset_id, hpo_id):
            LOGGER.info(
                'The hpo site {hpo_id} is missing the participant_match data'.
                format(hpo_id=hpo_id))

            non_matching_person_ids.extend(
                get_list_non_match_participants(project_id,
                                                validation_dataset_id, hpo_id))
        else:
            LOGGER.info(
                'The hpo site {hpo_id} submitted the participant_match data'.
                format(hpo_id=hpo_id))

    queries = []

    if non_matching_person_ids:
        LOGGER.info(
            'Participants: {person_ids} and their data will be dropped from {combined_dataset_id}'
            .format(person_ids=non_matching_person_ids,
                    combined_dataset_id=dataset_id))

        queries.append(
            remove_pids.get_sandbox_queries(project_id, dataset_id,
                                            non_matching_person_ids,
                                            TICKET_NUMBER))
        queries.extend(
            remove_pids.get_remove_pids_queries(project_id, dataset_id,
                                                non_matching_person_ids))

    return queries
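
A hedged sketch of calling the generator above and inspecting its output. The dataset name is a hypothetical placeholder; the EHR and validation datasets fall back to their lookup defaults when omitted, and the queries are printed rather than executed:

# Values are hypothetical placeholders.
queries = delete_records_for_non_matching_participants(
    project_id='my-project', dataset_id='combined20190801')
for query in queries:
    print(query)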
Example #6
    def setUp(self):
        self.hpo_id = 'fake'
        self.project_id = 'fake-project-id'
        self.test_project_id = app_identity.get_application_id()
        self.pid_table_id = 'pid_table'
        self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
        self.dataset_ids = 'all_datasets'
        self.person_research_ids = [(1, 6890173), (2, 858761),
                                    (1234567, 4589763)]
Example #7
def clean_unioned_ehr_dataset(project=None, dataset=None):
    if dataset is None or dataset == '' or dataset.isspace():
        dataset = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset)

    query_list = _gather_unioned_ehr_queries(project, dataset)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)
Example #8
    def setUp(self):
        self.hpo_id = 'fake'
        self.project_id = 'fake-project-id'
        self.test_project_id = app_identity.get_application_id()
        self.ehr_dataset_id = 'ehr20190801_fake'
        self.unioned_dataset_id = 'unioned_ehr20190801'
        self.combined_dataset_id = 'combined20190801'
        self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
        self.person_ids = [1, 2, 1234567]
        self.tables_to_retract_unioned = retract_data_bq.TABLES_FOR_RETRACTION | {common.FACT_RELATIONSHIP, common.PERSON}
        self.tables_to_retract_combined = retract_data_bq.TABLES_FOR_RETRACTION | {common.FACT_RELATIONSHIP}
        self.all_tables = resources.CDM_TABLES
Example #9
def union_ehr():
    hpo_id = 'unioned_ehr'
    app_id = bq_utils.app_identity.get_application_id()
    input_dataset_id = bq_utils.get_dataset_id()
    output_dataset_id = bq_utils.get_unioned_dataset_id()
    ehr_union.main(input_dataset_id, output_dataset_id, app_id)

    run_achilles(hpo_id)
    now_date_string = datetime.datetime.now().strftime('%Y_%m_%d')
    folder_prefix = 'unioned_ehr_' + now_date_string + '/'
    run_export(datasource_id=hpo_id, folder_prefix=folder_prefix)
    logging.info(f"Uploading achilles index files")
    _upload_achilles_files(hpo_id, folder_prefix)
    return 'merge-and-achilles-done'
Example #10
    def setUp(self):
        self.hpo_id = test_util.FAKE_HPO_ID
        self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
        self.site_bucket = 'test_bucket'
        self.folder_1 = '2019-01-01-v1/'
        self.folder_2 = '2019-02-02-v2/'
        self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
        self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
        self.pids = [17, 20]
        self.skip_pids = [10, 25]
        self.project_id = 'project_id'
        self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
        self.pid_table_id = 'pid_table'
        self._empty_bucket()
Example #11
def clean_unioned_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the unioned ehr dataset.

    :param project_id:  Name of the BigQuery project.
    :param dataset_id:  Name of the dataset to clean
    """
    if dataset_id is None or dataset_id == '' or dataset_id.isspace():
        dataset_id = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s', dataset_id)

    query_list = _gather_unioned_ehr_queries(project_id, dataset_id)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project_id, dataset_id, query_list)
Example #12
def get_dataset_and_project_names():
    """
    Get project and dataset names from environment variables.

    :return: A dictionary of dataset names and project name
    """
    project_and_dataset_names = dict()
    project_and_dataset_names[clean_cdr_consts.EHR_DATASET] = bq_utils.get_dataset_id()
    project_and_dataset_names[clean_cdr_consts.UNIONED_EHR_DATASET] = bq_utils.get_unioned_dataset_id()
    project_and_dataset_names[clean_cdr_consts.RDR_DATASET] = bq_utils.get_rdr_dataset_id()
    project_and_dataset_names[clean_cdr_consts.EHR_RDR_DATASET] = bq_utils.get_ehr_rdr_dataset_id()
    project_and_dataset_names[clean_cdr_consts.EHR_RDR_DE_IDENTIFIED] = bq_utils.get_combined_deid_dataset_id()
    project_and_dataset_names[clean_cdr_consts.PROJECT] = app_identity.get_application_id()

    return project_and_dataset_names
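
A short usage sketch: the returned dictionary is keyed by the clean_cdr_consts constants used above, so callers can look up a name per pipeline stage:

names = get_dataset_and_project_names()
# Keys come from clean_cdr_consts, as used in the function above.
project = names[clean_cdr_consts.PROJECT]
unioned = names[clean_cdr_consts.UNIONED_EHR_DATASET]
print(project, unioned)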
Example #13
    def setUp(self):
        super(EhrUnionTest, self).setUp()
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()
        self.testbed.init_memcache_stub()
        self.testbed.init_urlfetch_stub()
        self.testbed.init_blobstore_stub()
        self.testbed.init_datastore_v3_stub()
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [CHS_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)
Example #14
    def test_execute_queries(self):
        project_id = bq_utils.app_identity.get_application_id()
        dataset_id = bq_utils.get_combined_dataset_id()
        sandbox_id = bq_utils.get_unioned_dataset_id()
        test_util.delete_all_tables(dataset_id)

        create_tables = (
            ['person'] + common.CLINICAL_DATA_TABLES +
            ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
        # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
        for tbl in ['concept', 'concept_ancestor']:
            if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
                create_tables.append(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(tbl,
                                           tbl,
                                           dataset_id=dataset_id,
                                           force_all_nullable=True)

        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            resp = bq_utils.query(
                tmpl.render(project_id=project_id,
                            dataset_id=dataset_id,
                            rdr_basics_concept_id=123,
                            rdr_consent_concept_id=345,
                            ehr_obs_concept_id=567,
                            rdr_basics_module_concept_id=
                            drop_participants_without_ppi_or_ehr.
                            BASICS_MODULE_CONCEPT_ID))
            self.assertTrue(resp["jobComplete"])

        clean_cdr_engine.clean_dataset(
            project_id, dataset_id, sandbox_id,
            [(drop_participants_without_ppi_or_ehr.get_queries, )])

        def table_to_person_ids(t):
            rows = bq_utils.response2rows(
                bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                    project_id, dataset_id, t)))
            return set([r["person_id"] for r in rows])

        # We expect participants 1, 5 to have been removed from all tables.
        self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
        self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
        self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

        test_util.delete_all_tables(dataset_id)
Example #15
    def setUp(self):
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [PITT_HPO_ID, NYC_HPO_ID, EXCLUDED_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self.storage_client = StorageClient(self.project_id)
        self.tearDown()

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.VISIT_DETAIL_ID,
            eu_constants.CARE_SITE_ID, eu_constants.LOCATION_ID
        ]
Example #16
    def setUp(self):

        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        # Done in tearDown(); this is redundant.
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
            eu_constants.LOCATION_ID
        ]
Example #17
    def setUp(self):
        self.bq_project_id = app_identity.get_application_id()
        self.bq_dataset_id = bq_utils.get_unioned_dataset_id()