Example #1
def remove_ehr_data_queries(project_id, ticket_number, pids_project_id,
                            pids_dataset_id, tablename):
    """
    Creates sandboxes and drops all EHR data found for deactivated participants after
    their deactivation date

    :param project_id: BQ name of the project
    :param ticket_number: Jira ticket number to identify and title sandbox tables
    :param pids_project_id: BQ project_id of the project containing the deactivated participants PIDs table
    :param pids_dataset_id: BQ dataset_id of the dataset containing the deactivated participants PIDs table
    :param tablename: The name of the table to house the deactivated participant data
    """

    ehr_union_dataset = bq_utils.get_unioned_dataset_id()

    # Fetch the deactivated participant records to ensure they are up to date
    df = psr.get_deactivated_participants(pids_project_id, pids_dataset_id,
                                          tablename,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)
    # Store the dataframe in a BQ dataset table
    destination_table = pids_dataset_id + '.' + tablename
    psr.store_participant_data(df, project_id, destination_table)
    # Create sandbox and truncate queries that drop deactivated participant data
    queries = rdp.create_queries(
        project_id,
        ticket_number=ticket_number,
        # the deactivated participants table is stored in the same project
        # as the data being retracted
        pids_project_id=project_id,
        pids_dataset_id=pids_dataset_id,
        pids_table=tablename,
        datasets=[ehr_union_dataset])

    return queries
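A minimal usage sketch for this variant, assuming `red` is the module defining the function above and `rdp` is the retraction helper whose `run_queries` is called the same way in the integration test at the end of this page; import paths are not shown in these snippets, and the project, dataset, ticket, and table names are placeholders:

from google.cloud import bigquery

client = bigquery.Client(project='my-project')   # placeholder project
queries = red.remove_ehr_data_queries(
    'my-project',                                # project_id (placeholder)
    ticket_number='DC-1234',                     # placeholder Jira ticket
    pids_project_id='my-project',
    pids_dataset_id='my_pids_dataset',           # placeholder dataset holding the PIDs table
    tablename='_deactivated_participants')       # placeholder PIDs table name
rdp.run_queries(queries, client)                 # sandbox, then truncate, the affected EHR rows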
Example #2
    def setup_rule(self, client):
        """
        Responsible for grabbing and storing deactivated participant data.

        :param client: client object passed to store the data
        """
        LOGGER.info("Querying RDR API for deactivated participant data")
        # Fetch the deactivated participant records to ensure they are up to date
        df = psr.get_deactivated_participants(self.api_project_id,
                                              DEACTIVATED_PARTICIPANTS_COLUMNS)

        LOGGER.info(f"Found '{len(df)}' deactivated participants via RDR API")

        # Store the dataframe in a BQ dataset table named _deactivated_participants
        psr.store_participant_data(df, self.project_id, self.destination_table)

        LOGGER.info(f"Finished storing participant records in: "
                    f"`{self.destination_table}`")

        LOGGER.debug("instantiating class client object")
        self.client = client

        # reinitializing self.affected_tables
        LOGGER.debug(
            "reinitializing self.affected_tables to actual tables available")
        tables_list = self.client.list_tables(self.dataset_id)
        self.affected_tables = [
            table_item.table_id for table_item in tables_list
        ]
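A sketch of how this setup step might be driven; only the method body is shown above, so the rule class name and constructor arguments below are purely illustrative placeholders:

from google.cloud import bigquery

client = bigquery.Client(project='my-project')   # placeholder project
rule = RemoveEhrDataPastDeactivation(            # hypothetical name for the class owning setup_rule
    project_id='my-project',
    dataset_id='my_dataset',
    sandbox_dataset_id='my_sandbox',
    api_project_id='rdr-api-project')            # assumed constructor arguments
rule.setup_rule(client)          # pulls deactivated participants and stores them in BigQuery
print(rule.affected_tables)      # table ids found in dataset_id, refreshed by setup_rule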
Example #3
def remove_ehr_data_queries(client, api_project_id, project_id, dataset_id,
                            sandbox_dataset_id):
    """
    Sandboxes and drops all EHR data found for deactivated participants after their deactivation date

    :param client: BQ client
    :param api_project_id: Project containing the RDR Participant Summary API
    :param project_id: Identifies the project containing the target dataset
    :param dataset_id: Identifies the dataset to retract deactivated participants from
    :param sandbox_dataset_id: Identifies the sandbox dataset to store records for dataset_id
    :returns queries: List of query dictionaries
    """
    # Fetch the deactivated participant records to ensure they are up to date
    df = psr.get_deactivated_participants(api_project_id,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)

    # Store the dataframe in a BQ dataset table named _deactivated_participants
    destination_table = f'{sandbox_dataset_id}.{DEACTIVATED_PARTICIPANTS}'
    psr.store_participant_data(df, project_id, destination_table)

    fq_deact_table = f'{project_id}.{destination_table}'
    deact_table_ref = gbq.TableReference.from_string(fq_deact_table)
    LOGGER.info(f"Retracting deactivated participants from '{dataset_id}'")
    LOGGER.info(
        f"Using sandbox dataset '{sandbox_dataset_id}' for '{dataset_id}'")
    # Create sandbox and truncate queries that drop deactivated participant data
    queries = rdp.generate_queries(client, project_id, dataset_id,
                                   sandbox_dataset_id, deact_table_ref)
    return queries
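A usage sketch for this client-based variant; the client is obtained with the same `bq.get_client` helper used in Example #4 below, and all resource names are placeholders:

client = bq.get_client('my-project')                 # placeholder project
queries = remove_ehr_data_queries(
    client,
    api_project_id='rdr-api-project',                # placeholder RDR API project
    project_id='my-project',
    dataset_id='unioned_ehr_dataset',                # placeholder dataset to retract from
    sandbox_dataset_id='my_sandbox')                 # placeholder sandbox dataset
# Each entry is a query dictionary; the runner that executes them is not shown in these snippets.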
Example #4
def main(project_id,
         rdr_project_id,
         org_id=None,
         hpo_id=None,
         dataset_id=DRC_OPS):

    # Get the list of HPOs
    LOGGER.info('Getting hpo list...')
    if org_id:
        hpo_list = [{"hpo_id": hpo_id, "org_id": org_id}]
    else:
        hpo_list = get_hpo_info(project_id)

    LOGGER.info(hpo_list)

    for hpo in hpo_list:
        org_id = hpo['org_id']
        hpo_id = hpo['hpo_id']
        # Get participant summary data
        LOGGER.info(f'Getting participant summary data for {org_id}...')
        participant_info = get_org_participant_information(
            rdr_project_id, org_id)

        # Load schema and create ingestion time-partitioned table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = f'{PS_API_VALUES}_{hpo_id}'

        client = bq.get_client(project_id)
        try:
            table = client.get_table(f'{project_id}.{dataset_id}.{tablename}')
        except NotFound:
            LOGGER.info(
                f'Creating table {project_id}.{dataset_id}.{tablename}...')

            table = bigquery.Table(f'{project_id}.{dataset_id}.{tablename}',
                                   schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.HOUR)
            table = client.create_table(table)

        # Insert summary data into table
        LOGGER.info(
            f'Storing participant data for {org_id} in table {project_id}.{dataset_id}.{tablename}...'
        )
        store_participant_data(participant_info,
                               project_id,
                               f'{dataset_id}.{tablename}',
                               schema=schema)

    LOGGER.info('Done.')
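The function above can be called directly once the project identifiers are known; a minimal sketch with placeholder values (the real script presumably wires these in from the command line, which is not shown here):

# Ingest Participant Summary API data for a single organization (placeholders throughout).
main(project_id='my-project',
     rdr_project_id='rdr-api-project',
     org_id='EXAMPLE_ORG',
     hpo_id='example_hpo')

# Or ingest data for every HPO returned by get_hpo_info:
main(project_id='my-project', rdr_project_id='rdr-api-project')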
Example #5
    def test_store_participant_data(self):
        # Parameter check test
        self.assertRaises(RuntimeError, psr.store_participant_data,
                          self.fake_dataframe, None, self.destination_table)

        # Test
        psr.store_participant_data(self.fake_dataframe, self.project_id,
                                   self.destination_table)

        # Post conditions
        values = [(111, 'NO_CONTACT', datetime.date(2018, 12, 7)),
                  (222, 'NO_CONTACT', datetime.date(2018, 12, 7))]
        self.assertTableValuesMatch(
            '.'.join([self.project_id, self.destination_table]),
            self.bq_columns, values)
Example #6
    def test_store_participant_data(self, mock_to_gbq, mock_fields_for):
        # pre conditions
        table_schema = [{
            "type": "INTEGER",
            "name": "participantId"
        }, {
            "type": "STRING",
            "name": "suspensionStatus"
        }, {
            "type": "TIMESTAMP",
            "name": "suspensionTime"
        }]
        mock_fields_for.return_value = table_schema

        # parameter check test
        self.assertRaises(RuntimeError, psr.store_participant_data,
                          self.fake_dataframe, None, self.destination_table)

        # test
        results = psr.store_participant_data(self.fake_dataframe,
                                             self.project_id,
                                             self.destination_table)

        # post condition
        self.assertEqual(
            mock_to_gbq(self.fake_dataframe,
                        self.destination_table,
                        self.project_id,
                        if_exists='append',
                        table_schema=mock_fields_for), results)
Example #7
    def test_get_deactivated_participants(self, mock_get):
        # Pre conditions
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = self.json_response_entry

        # Tests
        df = psr.get_deactivated_participants(self.project_id, self.columns)

        psr.store_participant_data(df, self.project_id,
                                   f'{self.dataset_id}.{self.tablename}')

        # Post conditions
        values = [(111, 'NO_CONTACT', datetime.date(2018, 12, 7)),
                  (222, 'NO_CONTACT', datetime.date(2018, 12, 7))]
        self.assertTableValuesMatch(
            '.'.join([self.project_id, self.destination_table]),
            self.bq_columns, values)
Example #8
    def test_store_participant_data(self, mock_load_job_config,
                                    mock_bq_get_client):
        fake_job_id = 'fake_job_id'

        mock_load_job = MagicMock()
        mock_load_job.result = MagicMock(return_value=None)
        mock_load_job.job_id = fake_job_id

        mock_bq_client = MagicMock()
        mock_bq_get_client.return_value = mock_bq_client
        mock_bq_client.load_table_from_dataframe = MagicMock(
            return_value=mock_load_job)

        mock_load_config = MagicMock()
        mock_load_job_config.return_value = mock_load_config

        # parameter check test
        self.assertRaises(RuntimeError, psr.store_participant_data,
                          self.fake_dataframe, None, self.destination_table)

        # test
        actual_job_id = psr.store_participant_data(
            self.fake_dataframe,
            self.project_id,
            self.destination_table,
            schema=psr.get_table_schema(PS_API_VALUES))

        mock_bq_get_client.assert_called_once_with(self.project_id)
        mod_fake_dataframe = psr.set_dataframe_date_fields(
            self.fake_dataframe, psr.get_table_schema(PS_API_VALUES))
        # mock_bq_client.load_table_from_dataframe.assert_called_once_with(
        #     mod_fake_dataframe, self.destination_table, job_config=mock_load_config)
        pandas.testing.assert_frame_equal(
            mock_bq_client.load_table_from_dataframe.call_args[0][0],
            mod_fake_dataframe)
        mock_load_job_config.assert_called_once_with(
            schema=psr.get_table_schema(PS_API_VALUES))
        mock_load_job.result.assert_called_once_with()
        self.assertEqual(actual_job_id, fake_job_id)
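Condensed, the flow these unit tests mock out looks roughly like the following; the dataframe stands in for real Participant Summary API output, the destination table is a placeholder, and the schema call mirrors the one in the test above:

import pandas

# Stand-in dataframe; real code builds this from the Participant Summary API.
participant_df = pandas.DataFrame({
    'participantId': [111, 222],
    'suspensionStatus': ['NO_CONTACT', 'NO_CONTACT'],
    'suspensionTime': ['2018-12-07', '2018-12-07']
})
schema = psr.get_table_schema(PS_API_VALUES)          # schema used by the test above
job_id = psr.store_participant_data(
    participant_df,
    'my-project',                                     # placeholder project
    'my_dataset._deactivated_participants',           # placeholder destination table
    schema=schema)
# store_participant_data raises RuntimeError when project_id is None, as the tests assert.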
Example #9
    def test_get_deactivated_participants(self,
                                          mock_get_deactivated_participants,
                                          mock_store_participant_data):

        # pre conditions
        mock_get_deactivated_participants.return_value = self.fake_dataframe

        # tests
        dataframe_response = psr.get_deactivated_participants(
            self.project_id, self.dataset_id, self.tablename, self.columns)

        dataset_response = psr.store_participant_data(dataframe_response,
                                                      self.project_id,
                                                      self.destination_table)
        expected_response = mock_store_participant_data(
            dataframe_response, self.project_id, self.destination_table)

        # post conditions
        pandas.testing.assert_frame_equal(
            dataframe_response,
            pandas.DataFrame(self.updated_deactivated_participants,
                             columns=self.columns))

        self.assertEqual(expected_response, dataset_response)
Example #10
    def test_remove_ehr_data_past_deactivation_date(self, mock_retraction_info,
                                                    mock_get):
        # pre conditions for participant summary API module
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = self.json_response_entry

        # Ensure the deactivated participants table is created and/or updated
        df = psr.get_deactivated_participants(self.project_id, self.dataset_id,
                                              self.tablename, self.columns)
        psr.store_participant_data(df, self.project_id,
                                   f'{self.dataset_id}.{self.tablename}')
        # pre conditions for retraction module
        d = {
            'project_id': [
                self.project_id, self.project_id, self.project_id,
                self.project_id, self.project_id, self.project_id
            ],
            'dataset_id': [
                self.dataset_id, self.dataset_id, self.dataset_id,
                self.dataset_id, self.dataset_id, self.dataset_id
            ],
            'table': [
                'condition_occurrence', 'drug_exposure', 'measurement',
                'observation', 'procedure_occurrence', 'visit_occurrence'
            ],
            'date_column': [
                None, None, 'measurement_date', 'observation_date',
                'procedure_date', None
            ],
            'start_date_column': [
                'condition_start_date', 'drug_exposure_start_date', None, None,
                None, 'visit_start_date'
            ],
            'end_date_column': [
                'condition_end_date', 'drug_exposure_end_date', None, None,
                None, 'visit_end_date'
            ]
        }
        retraction_info = pandas.DataFrame(data=d)
        mock_retraction_info.return_value = retraction_info

        load_data_queries = []
        dropped_row_count_queries = []
        kept_row_count_queries = []
        sandbox_row_count_queries = []

        # Queries to load the dummy data into the tables
        measurement_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{measurement}}`
        (measurement_id, person_id, measurement_concept_id, measurement_date,
        measurement_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            measurement=TABLES[0])
        load_data_queries.append(measurement_query)

        observation_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{observation}}`
        (observation_id, person_id, observation_concept_id, observation_date,
        observation_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            observation=TABLES[1])
        load_data_queries.append(observation_query)

        procedure_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{procedure}}`
        (procedure_occurrence_id, person_id, procedure_concept_id, procedure_date,
        procedure_datetime, procedure_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 procedure=TABLES[2])
        load_data_queries.append(procedure_occ_query)

        condition_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{condition}}`
        (condition_occurrence_id, person_id, condition_concept_id, condition_start_date,
        condition_start_datetime, condition_end_date, condition_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 condition=TABLES[3])
        load_data_queries.append(condition_occ_query)

        drug_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{drug}}`
        (drug_exposure_id, person_id, drug_concept_id, drug_exposure_start_date, 
        drug_exposure_start_datetime, drug_exposure_end_date, drug_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                           ).render(project=self.project_id,
                                                    dataset=self.dataset_id,
                                                    drug=TABLES[4])
        load_data_queries.append(drug_query)

        visit_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{visit}}`
        (visit_occurrence_id, person_id, visit_concept_id, visit_start_date, visit_start_datetime,
        visit_end_date, visit_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                            ).render(project=self.project_id,
                                                     dataset=self.dataset_id,
                                                     visit=TABLES[5])
        load_data_queries.append(visit_query)

        # Create tables
        fq_table_names = []
        for table_name in TABLES:
            fq_table_names.append(
                f'{self.project_id}.{self.dataset_id}.{table_name}')
        bq.create_tables(self.client,
                         self.project_id,
                         fq_table_names,
                         exists_ok=True)

        # Load queries
        for query in load_data_queries:
            response = self.client.query(query)
            self.assertIsNotNone(response.result())
            self.assertIsNone(response.exception())

        # Build queries to count the rows expected to be dropped, kept, and sandboxed
        for ehr in self.deactivated_participants:
            pid = ehr[0]
            for row in retraction_info.itertuples(index=False):
                if row.date_column is None:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_END_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        end_date_column=row.end_date_column,
                        start_date_column=row.start_date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                else:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        date_column=row.date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                dropped_row_count_queries.append({
                    clean_consts.QUERY:
                    dropped_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                kept_row_count_queries.append({
                    clean_consts.QUERY:
                    kept_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                sandbox_row_count_queries.append({
                    clean_consts.QUERY:
                    sandbox_query,
                    clean_consts.DESTINATION_DATASET:
                    self.sandbox_id,
                    clean_consts.DESTINATION_TABLE:
                    self.tablename
                })

        # Use query results to count the number of rows expected to be dropped
        expected_dropped_row_count = {}
        for query_dict in dropped_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict[
                    'destination_table_id'] in expected_dropped_row_count:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] += result.total_rows
            else:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] = result.total_rows

        # Separate check to find number of actual deleted rows
        q = rdpt.TABLE_ROWS_QUERY.format(dataset_id=self.dataset_id)
        q_result = self.client.query(q)
        row_count_before_retraction = {}
        for row in q_result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # Use query results to count the number of rows expected to be kept
        expected_kept_row_count = {}
        for query_dict in kept_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_kept_row_count:
                expected_kept_row_count[
                    query_dict['destination_table_id']] -= (
                        (row_count_before_retraction[
                            query_dict['destination_table_id']] -
                         result.total_rows))
            else:
                expected_kept_row_count[query_dict['destination_table_id']] = (
                    row_count_before_retraction[
                        query_dict['destination_table_id']] -
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))

        # Perform retraction
        query_list = red.remove_ehr_data_queries(self.project_id,
                                                 self.ticket_number,
                                                 self.project_id,
                                                 self.dataset_id,
                                                 self.tablename)
        rdp.run_queries(query_list, self.client)

        # Find actual deleted rows
        q_result = self.client.query(q)
        results = q_result.result()
        row_count_after_retraction = {}
        for row in results:
            row_count_after_retraction[row['table_id']] = row['row_count']

        for table in expected_dropped_row_count:
            self.assertEqual(
                expected_dropped_row_count[table],
                row_count_before_retraction[table] -
                row_count_after_retraction[table])

        for table in expected_kept_row_count:
            self.assertEqual(expected_kept_row_count[table],
                             row_count_after_retraction[table])