Example no. 1
0
def run_bq_retraction(project_id, sandbox_dataset_id, pid_project_id,
                      pid_table_id, hpo_id, dataset_ids_list, retraction_type):
    """
    Main function to perform retraction

    pid table must follow schema described above in PID_TABLE_FIELDS and must reside in sandbox_dataset_id
    This function removes rows from all tables containing person_ids if they exist in pid_table_id

    :param project_id: project to retract from
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param pid_table_id: table containing the person_ids and research_ids
    :param hpo_id: hpo_id of the site to retract from
    :param dataset_ids_list: list of datasets to retract from separated by a space. If containing only 'all_datasets',
        retracts from all datasets. If containing only 'none', skips retraction from BigQuery datasets
    :param retraction_type: string indicating whether all data needs to be removed, including RDR,
        or if RDR data needs to be kept intact. Can take the values 'rdr_and_ehr' or 'only_ehr'
    :return: None
    """
    client = bq.get_client(project_id)
    dataset_ids = ru.get_datasets_list(project_id, dataset_ids_list)

    for dataset in dataset_ids:
        # BUG FIX: reset the query list for every dataset. Previously
        # `queries` was initialized once before the loop, so an iteration
        # that generated no queries (EHR retraction skipped because
        # hpo_id == NONE_STR, or a dataset matching no known type) re-ran
        # the PREVIOUS dataset's queries via retraction_query_runner.
        queries = []
        if ru.is_deid_dataset(dataset):
            # Deid datasets are keyed by research_id
            LOGGER.info(f"Retracting from DEID dataset {dataset}")
            research_id_query = JINJA_ENV.from_string(PERSON_ID_QUERY).render(
                person_research_id=RESEARCH_ID,
                pid_project=pid_project_id,
                sandbox_dataset_id=sandbox_dataset_id,
                pid_table_id=pid_table_id)
            queries = queries_to_retract_from_dataset(client, project_id,
                                                      dataset,
                                                      research_id_query,
                                                      retraction_type)
        else:
            # All other dataset types are keyed by person_id
            person_id_query = JINJA_ENV.from_string(PERSON_ID_QUERY).render(
                person_research_id=PERSON_ID,
                pid_project=pid_project_id,
                sandbox_dataset_id=sandbox_dataset_id,
                pid_table_id=pid_table_id)
            if ru.is_combined_dataset(dataset):
                LOGGER.info(f"Retracting from Combined dataset {dataset}")
                queries = queries_to_retract_from_dataset(
                    client, project_id, dataset, person_id_query)
            elif ru.is_unioned_dataset(dataset):
                LOGGER.info(f"Retracting from Unioned dataset {dataset}")
                queries = queries_to_retract_from_dataset(
                    client, project_id, dataset, person_id_query)
            elif ru.is_ehr_dataset(dataset):
                if hpo_id == NONE_STR:
                    LOGGER.info(
                        f'"RETRACTION_HPO_ID" set to "{NONE_STR}", skipping retraction from {dataset}'
                    )
                else:
                    LOGGER.info(f"Retracting from EHR dataset {dataset}")
                    queries = queries_to_retract_from_ehr_dataset(
                        client, project_id, dataset, hpo_id, person_id_query)
        # Only invoke the runner when this dataset actually produced work
        if queries:
            retraction_query_runner(client, queries)
    LOGGER.info('Retraction complete')
    return
Example no. 2
0
def main(args=None):
    """Entry point: retract deactivated participants from the requested datasets.

    :param args: optional argument list for the CLI parser; defaults to sys.argv
    """
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parsed = get_parser().parse_args(args)
    bq_client = bq.get_client(parsed.project_id)
    dataset_ids = ru.get_datasets_list(parsed.project_id, parsed.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(bq_client, parsed.project_id, dataset_ids,
                     parsed.fq_deact_table, parsed.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
Example no. 3
0
    def test_get_datasets_list(self, mock_get_client):
        """get_datasets_list filters vocabulary/sandbox datasets and honors its second argument."""
        # pre-conditions: the mocked client lists both datasets that should
        # survive filtering and datasets that should be dropped
        dropped = [
            data_ref('foo', 'vocabulary20201010'),
            data_ref('foo', 'R2019q4r1_deid_sandbox')
        ]
        kept = [
            data_ref('foo', '2021q1r1_rdr'),
            data_ref('foo', 'C2020q1r1_deid'),
            data_ref('foo', 'R2019q4r1_deid'),
            data_ref('foo', '2018q4r1_rdr')
        ]
        mock_client = mock.MagicMock()
        mock_get_client.return_value = mock_client
        mock_client.list_datasets.return_value = dropped + kept

        # 'all_datasets' flag returns every dataset except the filtered ones
        ds_list = ru.get_datasets_list('foo', ['all_datasets'])
        self.assertCountEqual([ds.dataset_id for ds in kept], ds_list)

        # a specific dataset id is passed through unchanged
        ds_list = ru.get_datasets_list('foo', ['C2020q1r1_deid'])
        self.assertEqual(['C2020q1r1_deid'], ds_list)

        # None input produces an empty result
        ds_list = ru.get_datasets_list('foo', None)
        self.assertEqual([], ds_list)

        # an empty list also produces an empty result
        ds_list = ru.get_datasets_list('foo', [])
        self.assertEqual([], ds_list)
Example no. 4
0
def run_bq_retraction(project_id, sandbox_dataset_id, pid_project_id,
                      pid_table_id, hpo_id, dataset_ids_str, retraction_type):
    """
    Main function to perform retraction

    pid table must follow schema described above in PID_TABLE_FIELDS and must reside in sandbox_dataset_id
    This function removes rows from all tables containing person_ids if they exist in pid_table_id

    :param project_id: project to retract from
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param pid_table_id: table containing the person_ids and research_ids
    :param hpo_id: hpo_id of the site to retract from
    :param dataset_ids_str: string of datasets to retract from separated by a space. If set to 'all_datasets',
        retracts from all datasets. If set to 'none', skips retraction from BigQuery datasets
    :param retraction_type: string indicating whether all data needs to be removed, including RDR,
        or if RDR data needs to be kept intact. Can take the values 'rdr_and_ehr' or 'only_ehr'
    :return: None
    """
    dataset_ids = ru.get_datasets_list(project_id, dataset_ids_str)

    # Bucket each dataset by type; retraction runs one type at a time below
    deid_datasets = []
    combined_datasets = []
    unioned_datasets = []
    ehr_datasets = []
    unclassified_datasets = []
    for dataset in dataset_ids:
        if ru.is_deid_dataset(dataset):
            deid_datasets.append(dataset)
        elif ru.is_combined_dataset(dataset):
            combined_datasets.append(dataset)
        elif ru.is_unioned_dataset(dataset):
            unioned_datasets.append(dataset)
        elif ru.is_ehr_dataset(dataset):
            ehr_datasets.append(dataset)
        else:
            unclassified_datasets.append(dataset)

    # ROBUSTNESS FIX: previously datasets matching none of the known types
    # were dropped silently; surface them so the operator can verify.
    if unclassified_datasets:
        LOGGER.warning(
            f"Skipping datasets of unrecognized type: {', '.join(unclassified_datasets)}"
        )

    # skip ehr datasets if hpo_id is indicated as none
    if hpo_id == 'none':
        LOGGER.info(
            '"RETRACTION_HPO_ID" set to "none", skipping retraction from EHR datasets'
        )
        ehr_datasets = []

    LOGGER.info(f"Retracting from EHR datasets: {', '.join(ehr_datasets)}")
    for dataset in ehr_datasets:
        ehr_mapping_queries, ehr_queries = queries_to_retract_from_ehr_dataset(
            project_id, dataset, pid_project_id, sandbox_dataset_id, hpo_id,
            pid_table_id)
        # mapping tables are retracted before the clinical tables
        retraction_query_runner(ehr_mapping_queries)
        retraction_query_runner(ehr_queries)
    LOGGER.info('Finished retracting from EHR datasets')

    LOGGER.info(
        f"Retracting from UNIONED datasets: {', '.join(unioned_datasets)}")
    for dataset in unioned_datasets:
        unioned_mapping_queries, unioned_queries = queries_to_retract_from_unioned_dataset(
            project_id, dataset, pid_project_id, sandbox_dataset_id,
            pid_table_id)
        retraction_query_runner(unioned_mapping_queries)
        retraction_query_runner(unioned_queries)
    LOGGER.info('Finished retracting from UNIONED datasets')

    LOGGER.info(
        f"Retracting from COMBINED datasets: {', '.join(combined_datasets)}")
    for dataset in combined_datasets:
        combined_mapping_queries, combined_queries = queries_to_retract_from_combined_or_deid_dataset(
            project_id,
            dataset,
            pid_project_id,
            sandbox_dataset_id,
            pid_table_id,
            retraction_type,
            deid_flag=False)
        retraction_query_runner(combined_mapping_queries)
        retraction_query_runner(combined_queries)
    LOGGER.info('Finished retracting from COMBINED datasets')

    # TODO ensure the correct research_ids for persons_ids are used for each deid retraction
    LOGGER.info(f"Retracting from DEID datasets: {', '.join(deid_datasets)}")
    for dataset in deid_datasets:
        deid_mapping_queries, deid_queries = queries_to_retract_from_combined_or_deid_dataset(
            project_id,
            dataset,
            pid_project_id,
            sandbox_dataset_id,
            pid_table_id,
            retraction_type,
            deid_flag=True)
        retraction_query_runner(deid_mapping_queries)
        retraction_query_runner(deid_queries)
    LOGGER.info('Finished retracting from DEID datasets')
Example no. 5
0
        required=True)
    parser.add_argument('-b',
                        '--sandbox_dataset_id',
                        action='store',
                        dest='sandbox_dataset_id',
                        help='Identifies sandbox dataset to store records',
                        required=True)
    args = parser.parse_args()

    pipeline_logging.configure(level=logging.DEBUG,
                               add_console_handler=args.console_log)

    client = bq.get_client(args.project_id)

    # keep only datasets existing in project
    dataset_ids = ru.get_datasets_list(args.project_id, [args.dataset_id])

    # dataset_ids should contain only one dataset (unioned_ehr)
    if len(dataset_ids) == 1:
        dataset_id = dataset_ids[0]
    else:
        raise RuntimeError(f'More than one dataset specified: {dataset_ids}')

    LOGGER.info(
        f"Dataset to retract deactivated participants from: {dataset_id}. "
        f"Using sandbox dataset: {args.sandbox_dataset_id}")

    deactivation_queries = remove_ehr_data_queries(client, args.api_project_id,
                                                   args.project_id, dataset_id,
                                                   args.sandbox_dataset_id)