def setUp(self):
     """Resolve project/RDR-dataset ids and ensure the sandbox dataset exists."""
     project = app_identity.get_application_id()
     dataset = bq_utils.get_rdr_dataset_id()
     self.project_id = project
     self.dataset_id = dataset
     # Derived sandbox dataset name for the RDR dataset under test.
     self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset)
     # Creates the sandbox dataset if it is not already present.
     sandbox.check_and_create_sandbox_dataset(project, dataset)
# --- Example no. 2 (scraper page marker; not executable code) ---
 def setUp(self):
     """Set up ids, sandbox dataset and BigQuery client; start from a clean sandbox."""
     project = get_application_id()
     # Dataset under test comes from the environment, not from bq_utils.
     dataset = os.environ.get('UNIONED_DATASET_ID')
     self.project_id = project
     self.dataset_id = dataset
     # check_and_create_sandbox_dataset returns the sandbox dataset id it ensured.
     self.sandbox_dataset_id = sandbox.check_and_create_sandbox_dataset(
         project, dataset)
     self.client = bq.get_client(project)
     # Remove any sandbox leftovers from a previous run.
     self.delete_sandbox()
    def setUp(self):
        """Prepare ids, sandbox dataset, fixture rows and a canned RDR API response."""
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.sandbox_id = check_and_create_sandbox_dataset(
            self.project_id, self.dataset_id)
        self.tablename = '_deactivated_participants'
        self.ticket_number = 'DC12345'

        self.columns = ['participantId', 'suspensionStatus', 'suspensionTime']

        # Expected table rows: (participant_id, suspension_status, suspension_date).
        self.deactivated_participants = [(1, 'NO_CONTACT', '2018-12-07'),
                                         (2, 'NO_CONTACT', '2019-12-07'),
                                         (3, 'NO_CONTACT', '2017-12-07')]

        # Canned FHIR-style bundle mimicking the RDR participant summary API.
        # NOTE(review): the 'https//' scheme typo is preserved from the fixture.
        participant_summaries = [('P1', '2018-12-07T08:21:14'),
                                 ('P2', '2019-12-07T08:21:14'),
                                 ('P3', '2017-12-07T08:21:14')]
        self.json_response_entry = {
            'entry': [{
                'fullUrl':
                    f'https//foo_project.appspot.com/rdr/v1/Participant/{pid}/Summary',
                'resource': {
                    'participantId': pid,
                    'suspensionStatus': 'NO_CONTACT',
                    'suspensionTime': suspension_time
                }
            } for pid, suspension_time in participant_summaries]
        }

        self.client = bq.get_client(self.project_id)
def create_queries(project_id,
                   ticket_number,
                   pids_project_id,
                   pids_dataset_id,
                   pids_table,
                   datasets=None):
    """
    Creates sandbox and truncate queries to run for EHR deactivated retraction

    :param project_id: Identifies the project where data is being retracted
    :param ticket_number: Jira ticket number to identify and title sandbox table
    :param pids_project_id: Identifies the project containing deactivated pids table
    :param pids_dataset_id: Identifies the dataset containing deactivated pids table
    :param pids_table: Name of the deactivated pids table. This table should have
      the following fields: (person_id: int, suspension_status: string, deactivated_date: date).
    :param datasets: (optional) List of datasets to retract from. If not provided,
      retraction will be performed from all datasets in project referred to by `project_id`.
    :return: list of query dictionaries

    NOTE: For dataset_ids matching `retraction.retract_utils.DEID_REGEX`, associated research_ids
    retrieved from an inferred combined dataset are used for retraction.
    """
    queries_list = []
    dataset_list = set()
    # Hit bq and receive df of deactivated ehr pids and deactivated date
    client = get_client(project_id)
    deactivated_ehr_pids_df = client.query(
        DEACTIVATED_PIDS_QUERY.render(project=pids_project_id,
                                      dataset=pids_dataset_id,
                                      table=pids_table)).to_dataframe()
    if datasets is None:
        date_columns_df = get_date_info_for_pids_tables(project_id, client)
    else:
        date_columns_df = get_date_info_for_pids_tables(project_id, client,
                                                        datasets)
    LOGGER.info(
        "Dataframe creation complete. DF to be used for creation of retraction queries."
    )
    # Collect qualifying rows in a plain list and build the frame once at the
    # end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
    # and appending row-by-row was quadratic anyway.
    final_date_rows = []
    for date_row in date_columns_df.itertuples(index=False):
        # Filter to only include tables containing deactivated pids with the earliest deactivated date
        LOGGER.info(
            f'Checking table: {date_row.project_id}.{date_row.dataset_id}.{date_row.table}'
        )
        if check_pid_exist(date_row, client, pids_project_id, pids_dataset_id,
                           pids_table):
            dataset_list.add(date_row.dataset_id)
            final_date_rows.append({
                'project_id': date_row.project_id,
                'dataset_id': date_row.dataset_id,
                'table': date_row.table,
                'date_column': date_row.date_column,
                'start_date_column': date_row.start_date_column,
                'end_date_column': date_row.end_date_column
            })
    final_date_column_df = pd.DataFrame(final_date_rows)

    # The sandbox dataset for a given (project, dataset) never changes within
    # this call, so cache it instead of re-issuing the check/create API call
    # for every PID x table combination in the loops below.
    sandbox_datasets = {}

    LOGGER.info(
        "Looping through the deactivated PIDS df to create queries based on the retractions needed per PID table"
    )
    for ehr_row in deactivated_ehr_pids_df.itertuples(index=False):
        LOGGER.info(f'Creating retraction queries for PID: {ehr_row.person_id}')
        for date_row in final_date_column_df.itertuples(index=False):
            # Determine if dataset is deid to correctly pull pid or research_id and check if ID exists in dataset or if
            # already retracted
            if re.match(DEID_REGEX, date_row.dataset_id):
                pid = get_research_id(date_row.project_id, date_row.dataset_id,
                                      ehr_row.person_id, client)
            else:
                pid = ehr_row.person_id

            # Get or create sandbox dataset (cached per project/dataset pair)
            sandbox_key = (date_row.project_id, date_row.dataset_id)
            sandbox_dataset = sandbox_datasets.get(sandbox_key)
            if sandbox_dataset is None:
                sandbox_dataset = check_and_create_sandbox_dataset(
                    date_row.project_id, date_row.dataset_id)
                sandbox_datasets[sandbox_key] = sandbox_dataset

            # Create queries based on type of date field
            LOGGER.info(
                f'Creating Query to retract {pid} from {date_row.dataset_id}.{date_row.table}'
            )
            # A null date_column means the table uses start/end date columns.
            if pd.isnull(date_row.date_column):
                sandbox_query = SANDBOX_QUERY_END_DATE.render(
                    project=date_row.project_id,
                    sandbox_dataset=sandbox_dataset,
                    dataset=date_row.dataset_id,
                    table=date_row.table,
                    pid=pid,
                    deactivated_pids_project=pids_project_id,
                    deactivated_pids_dataset=pids_dataset_id,
                    deactivated_pids_table=pids_table,
                    end_date_column=date_row.end_date_column,
                    start_date_column=date_row.start_date_column)
                clean_query = CLEAN_QUERY_END_DATE.render(
                    project=date_row.project_id,
                    dataset=date_row.dataset_id,
                    table=date_row.table,
                    pid=pid,
                    deactivated_pids_project=pids_project_id,
                    deactivated_pids_dataset=pids_dataset_id,
                    deactivated_pids_table=pids_table,
                    end_date_column=date_row.end_date_column,
                    start_date_column=date_row.start_date_column)
            else:
                sandbox_query = SANDBOX_QUERY_DATE.render(
                    project=date_row.project_id,
                    sandbox_dataset=sandbox_dataset,
                    dataset=date_row.dataset_id,
                    table=date_row.table,
                    pid=pid,
                    deactivated_pids_project=pids_project_id,
                    deactivated_pids_dataset=pids_dataset_id,
                    deactivated_pids_table=pids_table,
                    date_column=date_row.date_column)
                clean_query = CLEAN_QUERY_DATE.render(
                    project=date_row.project_id,
                    dataset=date_row.dataset_id,
                    table=date_row.table,
                    pid=pid,
                    deactivated_pids_project=pids_project_id,
                    deactivated_pids_dataset=pids_dataset_id,
                    deactivated_pids_table=pids_table,
                    date_column=date_row.date_column)
            # Sandbox query: copies retracted rows into a ticket-named table.
            queries_list.append({
                clean_consts.QUERY:
                    sandbox_query,
                clean_consts.DESTINATION:
                    date_row.project_id + '.' + sandbox_dataset + '.' +
                    (ticket_number + '_' + date_row.table),
                clean_consts.DESTINATION_DATASET:
                    date_row.dataset_id,
                clean_consts.DESTINATION_TABLE:
                    date_row.table,
                clean_consts.DISPOSITION:
                    bq_consts.WRITE_APPEND,
                'type':
                    'sandbox'
            })
            # Retraction query: truncates the source table to the kept rows.
            queries_list.append({
                clean_consts.QUERY:
                    clean_query,
                clean_consts.DESTINATION:
                    date_row.project_id + '.' + date_row.dataset_id + '.' +
                    date_row.table,
                clean_consts.DESTINATION_DATASET:
                    date_row.dataset_id,
                clean_consts.DESTINATION_TABLE:
                    date_row.table,
                clean_consts.DISPOSITION:
                    bq_consts.WRITE_TRUNCATE,
                'type':
                    'retraction'
            })
    LOGGER.info(
        f"Query list complete, retracting ehr deactivated PIDS from the following datasets: "
        f"{dataset_list}")
    return queries_list