Example #1
    def setup_rule(self, client=None):
        """
        Load required resources prior to executing cleaning rule queries.

        Method to run data upload operations before executing the first cleaning
        rule of a class.  For example, if your class requires loading a static
        table, that load operation should be defined here.  It SHOULD NOT BE
        defined as part of get_query_specs().
        :param client: a BigQuery client object
        :return: None

        :raises:  BadRequest, OSError, AttributeError, TypeError, ValueError if
            the load job fails. Error raised from bq.upload_csv_data_to_bq_table
            helper function.
        """

        # creating _unit_mapping table
        unit_mapping_table = (f'{self.project_id}.'
                              f'{self.sandbox_dataset_id}.'
                              f'{UNIT_MAPPING_TABLE}')
        bq.create_tables(
            client,
            self.project_id,
            [unit_mapping_table],
        )
        # Uploading data to _unit_mapping table
        unit_mappings_csv_path = os.path.join(resources.resource_files_path,
                                              UNIT_MAPPING_FILE)
        result = bq.upload_csv_data_to_bq_table(
            client, self.sandbox_dataset_id, UNIT_MAPPING_TABLE,
            unit_mappings_csv_path, UNIT_MAPPING_TABLE_DISPOSITION)
        LOGGER.info(
            f"Created {self.sandbox_dataset_id}.{UNIT_MAPPING_TABLE} and "
            f"loaded data from {unit_mappings_csv_path}")
Example #2
    def setup_rule(self, client, *args, **keyword_args):

        # Create _logging_standard_concept_id_replacement
        fq_table_names = [
            f'{self.project_id}.{self.sandbox_dataset_id}.{SRC_CONCEPT_ID_TABLE_NAME}'
        ]
        create_tables(client, self.project_id, fq_table_names, exists_ok=True)
Example #3
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = os.environ.get('UNIONED_DATASET_ID')

        self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
        self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
        self.table_ids = [self.fq_person_ext, self.fq_person]

        self.person = gbq.Table.from_string(self.fq_person)
        self.person_ext = gbq.Table.from_string(self.fq_person_ext)

        self.client = bq.get_client(self.project_id)
        self.tearDown()
        for fq_table in self.table_ids:
            table_id = fq_table.split('.')[2]
            if table_id == 'person':
                bq_utils.create_standard_table(table_id,
                                               table_id,
                                               drop_existing=True,
                                               dataset_id=self.dataset_id)
            else:
                bq.create_tables(self.client,
                                 self.project_id, [fq_table],
                                 exists_ok=True)

        self.populate_tables()
Example #4
    def setup_rule(self, client=None):
        """
        Load required resources prior to executing cleaning rule queries.

        Method to run data upload operations before executing the first cleaning
        rule of a class.  For example, if your class requires loading a static
        table, that load operation should be defined here.  It SHOULD NOT BE
        defined as part of get_query_specs().
        :param client: a BigQuery client object
        :return: None
        """

        # creating _unit_mapping table
        unit_mapping_table = f'{self.project_id}.{self.dataset_id}.{UNIT_MAPPING_TABLE}'
        bq.create_tables(
            client,
            self.project_id,
            [unit_mapping_table],
        )
        # Uploading data to _unit_mapping table
        unit_mappings_csv_path = os.path.join(resources.resource_files_path,
                                              UNIT_MAPPING_FILE)
        job = bq.upload_csv_data_to_bq_table(client, self.dataset_id,
                                             UNIT_MAPPING_TABLE,
                                             unit_mappings_csv_path,
                                             UNIT_MAPPING_TABLE_DISPOSITION)
        job.result()
        LOGGER.info(
            f"Created {self.dataset_id}.{UNIT_MAPPING_TABLE} and loaded data from {unit_mappings_csv_path}"
        )
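
Unlike Example #1, this version calls job.result() so the load either completes or raises inside setup_rule rather than at some later point. Below is a minimal sketch of that blocking pattern, assuming the helper returns a standard google-cloud-bigquery load job; the wrapper name and logger are placeholders.

import logging

from utils import bq  # assumed import path for the bq helper module used above

LOGGER = logging.getLogger(__name__)


def load_csv_and_wait(client, dataset_id, table, csv_path, disposition):
    """Hypothetical wrapper: upload a CSV and block until the load job finishes."""
    job = bq.upload_csv_data_to_bq_table(client, dataset_id, table, csv_path,
                                         disposition)
    job.result()  # raises (e.g. BadRequest) if the load failed
    LOGGER.info(f"Load of {dataset_id}.{table} from {csv_path} finished")
    return job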
Example #5
        def setUpClass(cls):
            # get the test project
            if 'test' not in cls.project_id:
                raise RuntimeError(
                    f'Tests should only run in a test environment. '
                    f'Current environment is {cls.project_id}.')

            if not cls.fq_table_names:
                raise RuntimeError(
                    f'Provide a list of fully qualified table names the '
                    f'test will manipulate.')

            cls.client = bq.get_client(cls.project_id)

            # get or create datasets, cleaning rules can assume the datasets exist
            required_datasets = []
            for table_name in cls.fq_table_names + cls.fq_sandbox_table_names:
                dataset_id = table_name.split('.')[1]
                required_datasets.append(dataset_id)

            desc = (f"dataset created by {cls.__name__} to test a "
                    f"cleaning rule.  deletion candidate.")
            for dataset_id in set(required_datasets):
                dataset = bq.define_dataset(cls.project_id, dataset_id, desc,
                                            {'test': ''})
                cls.client.create_dataset(dataset, exists_ok=True)

            bq.create_tables(cls.client, cls.project_id, cls.fq_table_names,
                             True)
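
The setUpClass above follows a strict order: define each required dataset, create it with exists_ok=True, and only then create the tables (the positional True passed to bq.create_tables is its exists_ok argument). A condensed sketch of that ordering, with a hypothetical helper name and relying only on the call signatures visible above:

def ensure_datasets_and_tables(client, project_id, fq_table_names, description):
    """Hypothetical helper: create any missing datasets before creating tables."""
    # relies only on the bq.define_dataset / create_dataset / create_tables
    # calls shown in the example above
    dataset_ids = {name.split('.')[1] for name in fq_table_names}
    for dataset_id in dataset_ids:
        dataset = bq.define_dataset(project_id, dataset_id, description,
                                    {'test': ''})  # label dict, as above
        client.create_dataset(dataset, exists_ok=True)
    bq.create_tables(client, project_id, fq_table_names, exists_ok=True)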
Example #6
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = os.environ.get('UNIONED_DATASET_ID')

        self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
        self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
        self.table_ids = [self.fq_person, self.fq_person_ext]

        self.person = Table.from_string(self.fq_person)
        self.person_ext = Table.from_string(self.fq_person_ext)

        self.client = bq.get_client(self.project_id)
        for table in self.table_ids:
            self.client.delete_table(table, not_found_ok=True)
        bq.create_tables(self.client, self.project_id, self.table_ids)
        self.populate_tables()
Example #7
def get_domain_mapping_queries(project_id, dataset_id):
    """
    This function generates a list of query dicts for creating id mappings in _logging_domain_alignment.
    The list will be consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in _logging_domain_alignment
    """
    # Create _logging_domain_alignment
    client = bq.get_client(project_id)
    table_id = f'{project_id}.{dataset_id}.{DOMAIN_ALIGNMENT_TABLE_NAME}'
    client.delete_table(table_id, not_found_ok=True)
    bq.create_tables(client, project_id, [table_id], exists_ok=False)

    domain_mapping_queries = []

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        domain_mapping_queries.append(query)

    # Create the query for creating field_mappings for the records moving between the same domain
    query = parse_domain_mapping_query_for_same_domains(project_id, dataset_id)
    domain_mapping_queries.append(query)

    # Create the query for the records that are in the wrong domain but will not be moved
    query = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    domain_mapping_queries.append(query)

    unioned_query = {
        cdr_consts.QUERY: UNION_ALL.join(domain_mapping_queries),
        cdr_consts.DESTINATION_TABLE: DOMAIN_ALIGNMENT_TABLE_NAME,
        cdr_consts.DISPOSITION: bq_consts.WRITE_EMPTY,
        cdr_consts.DESTINATION_DATASET: dataset_id
    }

    return [unioned_query]
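
get_domain_mapping_queries returns query dicts keyed by cdr_consts.QUERY, DESTINATION_TABLE, DISPOSITION, and DESTINATION_DATASET, which the cleaning engine later executes. The sketch below shows one way such a spec could be run; it is not the project's clean_engine, and the QueryJobConfig usage is an assumption based on the standard google-cloud-bigquery client.

from google.cloud import bigquery


def run_query_spec(client, project_id, spec):
    """Hypothetical executor for one query dict of the shape returned above."""
    # cdr_consts is assumed to be the same constants module used in the example
    destination = (f'{project_id}.'
                   f'{spec[cdr_consts.DESTINATION_DATASET]}.'
                   f'{spec[cdr_consts.DESTINATION_TABLE]}')
    job_config = bigquery.QueryJobConfig(
        destination=bigquery.TableReference.from_string(destination),
        write_disposition=spec[cdr_consts.DISPOSITION])
    return client.query(spec[cdr_consts.QUERY], job_config=job_config).result()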
Example #8
    def setup_data(self):
        self.tearDown()
        # setup deactivated participants table
        deact_table_ref = gbq.TableReference.from_string(self.deact_table)
        bq.create_tables(self.client,
                         self.project_id, [self.deact_table],
                         exists_ok=True)
        job_config = gbq.QueryJobConfig()
        job = self.client.query(
            DEACTIVATED_PIDS.render(deact_table=deact_table_ref), job_config)
        job.result()

        # create omop tables and mapping/ext tables
        for table in self.tables:
            fq_table = f'{self.project_id}.{self.dataset_id}.{table}'
            bq.create_tables(self.client,
                             self.project_id, [fq_table],
                             exists_ok=True)
            table_ref = gbq.TableReference.from_string(fq_table)
            job_config = gbq.QueryJobConfig()
            job = self.client.query(self.tables[table].render(table=table_ref),
                                    job_config)
            job.result()
Example #9
    def setUp(self):
        bq.create_tables(self.client, self.project_id, self.fq_table_names,
                         True)
Example #10
def check_and_copy_tables(project_id, dataset_id):
    """
    Check that all the required tables exist; if any are missing, they will be
    created or copied from another table.

    :param project_id: Project where the dataset resides
    :param dataset_id: Dataset where the required lab table needs to be created
    :return: None
    """

    descendants_table_name = f'{project_id}.{dataset_id}.{MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE}'
    concept_sets_table_name = f'{project_id}.{dataset_id}.{MEASUREMENT_CONCEPT_SETS_TABLE}'

    client = bq.get_client(project_id)
    dataset = client.dataset(dataset_id)
    vocab_dataset = client.dataset(common.VOCABULARY_DATASET)

    # concept table and concept ancestor table source tables
    concept_source_table = vocab_dataset.table(common.CONCEPT)
    concept_ancestor_source_table = vocab_dataset.table(common.CONCEPT_ANCESTOR)

    # concept table and concept ancestor table destination tables
    concept_dest_table = dataset.table(common.CONCEPT)
    concept_ancestor_dest_table = dataset.table(common.CONCEPT_ANCESTOR)

    # query will check the row counts of each table in the specified dataset
    row_count_query = ROW_COUNT_QUERY.render(project_id=project_id,
                                             dataset_id=dataset_id)

    results_dataframe = client.query(row_count_query).to_dataframe()
    empty_results_dataframe = results_dataframe[(
        results_dataframe['row_count'] == 0)]

    # checks if CONCEPT and CONCEPT_ANCESTOR tables exist, if they don't, they are copied from the
    # CONCEPT and CONCEPT_ANCESTOR tables in common.VOCABULARY
    if common.CONCEPT not in (results_dataframe['table_id']).values:
        client.copy_table(concept_source_table, concept_dest_table)
    if common.CONCEPT_ANCESTOR not in (results_dataframe['table_id']).values:
        client.copy_table(concept_ancestor_source_table,
                          concept_ancestor_dest_table)

    # checks if CONCEPT and CONCEPT_ANCESTOR tables are empty, if they are, they are copied from the CONCEPT and
    # CONCEPT_ANCESTOR tables in common.VOCABULARY
    if common.CONCEPT in (empty_results_dataframe['table_id']).values:
        client.copy_table(concept_source_table, concept_dest_table)
    if common.CONCEPT_ANCESTOR in (empty_results_dataframe['table_id']).values:
        client.copy_table(concept_ancestor_source_table,
                          concept_ancestor_dest_table)

    # checks if MEASUREMENT_CONCEPT_SETS_TABLE and MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE exist, if they
    # do not exist, they will be created
    if MEASUREMENT_CONCEPT_SETS_TABLE not in results_dataframe[
            'table_id'].values:
        bq.create_tables(client=client,
                         project_id=project_id,
                         fq_table_names=[concept_sets_table_name],
                         exists_ok=True,
                         fields=None)
    if MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE not in results_dataframe[
            'table_id'].values:
        bq.create_tables(client=client,
                         project_id=project_id,
                         fq_table_names=[descendants_table_name],
                         exists_ok=True,
                         fields=None)
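
check_and_copy_tables runs the same copy in two situations: when a vocabulary table is missing from the dataset, and when it exists but has zero rows. Below is a hypothetical helper condensing those checks, using the row-count dataframe produced above and waiting on the copy job before returning.

def copy_if_missing_or_empty(client, results_dataframe, table_id, source_table,
                             destination_table):
    """Hypothetical helper: copy a vocabulary table when it is absent or empty."""
    existing = results_dataframe['table_id'].values
    if table_id not in existing:
        client.copy_table(source_table, destination_table).result()  # wait for copy
        return
    row_count = results_dataframe.loc[
        results_dataframe['table_id'] == table_id, 'row_count'].iloc[0]
    if row_count == 0:
        client.copy_table(source_table, destination_table).result()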
Example #11
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [self.bq_dataset_id]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq.create_tables(
            self.client,
            self.test_project_id, [
                f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
            ],
            exists_ok=False,
            fields=[rbq.PID_TABLE_FIELDS])
        bq_formatted_insert_values = ', '.join([
            f'({person_id}, {research_id})'
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        job = self.client.query(q)
        job.result()

        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = f'{self.hpo_id}_{cdm_table}'
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            logging.info(
                f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
            with open(cdm_file, 'rb') as f:
                job_config = bigquery.LoadJobConfig()
                job_config.source_format = bigquery.SourceFormat.CSV
                job_config.skip_leading_rows = 1
                job_config.write_disposition = 'WRITE_EMPTY'
                job_config.schema = bq.get_table_schema(cdm_table)
                load_job = self.client.load_table_from_file(
                    f,
                    f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                    job_config=job_config)
                load_job.result()
        logging.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            job = self.client.query(row_count_queries[table])
            result = job.result()
            expected_row_count[table] = result.to_dataframe()['count'].to_list(
            )[0]

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_before_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        # perform retraction
        rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                              self.test_project_id, self.pid_table_id,
                              self.hpo_id, self.dataset_ids,
                              self.retraction_type)

        # find actual deleted rows
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_after_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table],
                row_counts_before_retraction[table] -
                row_counts_after_retraction[table])
Example #12
    def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_retraction_info):
        d = {
            'project_id': [
                self.project_id, self.project_id, self.project_id,
                self.project_id, self.project_id, self.project_id
            ],
            'dataset_id': [
                self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id,
                self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id
            ],
            'table': [
                'fake_condition_occurrence', 'fake_drug_exposure',
                'fake_measurement', 'fake_observation',
                'fake_procedure_occurrence', 'fake_visit_occurrence'
            ],
            'date_column': [
                None, None, 'measurement_date', 'observation_date',
                'procedure_date', None
            ],
            'start_date_column': [
                'condition_start_date', 'drug_exposure_start_date', None, None,
                None, 'visit_start_date'
            ],
            'end_date_column': [
                'condition_end_date', 'drug_exposure_end_date', None, None,
                None, 'visit_end_date'
            ]
        }
        retraction_info = pd.DataFrame(data=d)
        mock_retraction_info.return_value = retraction_info

        # Create and load person_ids and deactivated_date to pid table
        bq.create_tables(self.client,
                         self.project_id,
                         self.pid_table_id_list,
                         exists_ok=False,
                         fields=retract_deactivated_pids.PID_TABLE_FIELDS)
        bq_formatted_insert_values = ', '.join([
            '(%s, "%s")' % (person_id, deactivated_date)
            for (person_id,
                 deactivated_date) in self.deactivated_ehr_participants
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        self.client.query(q)

        job_ids = []
        dropped_row_count_queries = []
        kept_row_count_queries = []
        hpo_table_list = []

        # Load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # Do not process if person table
            if hpo_table == 'fake_person':
                continue
            hpo_table_list.append(hpo_table)
            logging.info(
                f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                        cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id,
                                           cdm_table,
                                           dataset_id=self.bq_dataset_id)
            logging.info(f'Loading table {self.bq_dataset_id}.{hpo_table}')
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)

        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0,
                         'NYC five person load job did not complete')
        logging.info('All tables loaded successfully')

        # Store query for checking number of rows to delete
        for ehr in self.deactivated_ehr_participants:
            pid = ehr[0]
            for row in retraction_info.itertuples(index=False):
                if row.date_column is None:
                    dropped_query = EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    kept_query = EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                else:
                    dropped_query = EXPECTED_DROPPED_ROWS_QUERY.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        date_column=row.date_column)
                    kept_query = EXPECTED_KEPT_ROWS_QUERY.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        date_column=row.date_column)
                dropped_row_count_queries.append({
                    clean_consts.QUERY: dropped_query,
                    clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                    clean_consts.DESTINATION_TABLE: row.table
                })
                kept_row_count_queries.append({
                    clean_consts.QUERY: kept_query,
                    clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                    clean_consts.DESTINATION_TABLE: row.table
                })

        # Use query results to count number of expected dropped row deletions
        expected_dropped_row_count = {}
        for query_dict in dropped_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_dropped_row_count:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] += result.total_rows
            else:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] = result.total_rows

        # Separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = self.client.query(q)
        row_count_before_retraction = {}
        for row in q_result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # Use query results to count number of expected dropped row deletions
        expected_kept_row_count = {}
        for query_dict in kept_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_kept_row_count:
                expected_kept_row_count[query_dict['destination_table_id']] -= (
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))
            else:
                expected_kept_row_count[query_dict['destination_table_id']] = (
                    row_count_before_retraction[
                        query_dict['destination_table_id']] -
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))

        # Perform retraction
        query_list = retract_deactivated_pids.create_queries(
            self.project_id, self.ticket_number, self.project_id,
            self.bq_dataset_id, self.pid_table_id)
        retract_deactivated_pids.run_queries(query_list, self.client)

        # Find actual deleted rows
        q_result = self.client.query(q)
        results = q_result.result()
        row_count_after_retraction = {}
        for row in results:
            row_count_after_retraction[row['table_id']] = row['row_count']

        for table in expected_dropped_row_count:
            self.assertEqual(
                expected_dropped_row_count[table],
                row_count_before_retraction[table] -
                row_count_after_retraction[table])

        for table in expected_kept_row_count:
            self.assertEqual(expected_kept_row_count[table],
                             row_count_after_retraction[table])
Example #13
    def test_remove_ehr_data_past_deactivation_date(self, mock_retraction_info,
                                                    mock_get):
        # pre conditions for participant summary API module
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = self.json_response_entry

        # Ensure deactivated participants table is created and or updated
        df = psr.get_deactivated_participants(self.project_id, self.dataset_id,
                                              self.tablename, self.columns)
        psr.store_participant_data(df, self.project_id,
                                   f'{self.dataset_id}.{self.tablename}')
        # pre conditions for retraction module
        d = {
            'project_id': [
                self.project_id, self.project_id, self.project_id,
                self.project_id, self.project_id, self.project_id
            ],
            'dataset_id': [
                self.dataset_id, self.dataset_id, self.dataset_id,
                self.dataset_id, self.dataset_id, self.dataset_id
            ],
            'table': [
                'condition_occurrence', 'drug_exposure', 'measurement',
                'observation', 'procedure_occurrence', 'visit_occurrence'
            ],
            'date_column': [
                None, None, 'measurement_date', 'observation_date',
                'procedure_date', None
            ],
            'start_date_column': [
                'condition_start_date', 'drug_exposure_start_date', None, None,
                None, 'visit_start_date'
            ],
            'end_date_column': [
                'condition_end_date', 'drug_exposure_end_date', None, None,
                None, 'visit_end_date'
            ]
        }
        retraction_info = pandas.DataFrame(data=d)
        mock_retraction_info.return_value = retraction_info

        load_data_queries = []
        dropped_row_count_queries = []
        kept_row_count_queries = []
        sandbox_row_count_queries = []

        # Queries to load the dummy data into the tables
        measurement_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{measurement}}`
        (measurement_id, person_id, measurement_concept_id, measurement_date,
        measurement_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            measurement=TABLES[0])
        load_data_queries.append(measurement_query)

        observation_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{observation}}`
        (observation_id, person_id, observation_concept_id, observation_date,
        observation_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            observation=TABLES[1])
        load_data_queries.append(observation_query)

        procedure_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{procedure}}`
        (procedure_occurrence_id, person_id, procedure_concept_id, procedure_date,
        procedure_datetime, procedure_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 procedure=TABLES[2])
        load_data_queries.append(procedure_occ_query)

        condition_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{condition}}`
        (condition_occurrence_id, person_id, condition_concept_id, condition_start_date,
        condition_start_datetime, condition_end_date, condition_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 condition=TABLES[3])
        load_data_queries.append(condition_occ_query)

        drug_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{drug}}`
        (drug_exposure_id, person_id, drug_concept_id, drug_exposure_start_date, 
        drug_exposure_start_datetime, drug_exposure_end_date, drug_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                           ).render(project=self.project_id,
                                                    dataset=self.dataset_id,
                                                    drug=TABLES[4])
        load_data_queries.append(drug_query)

        visit_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{visit}}`
        (visit_occurrence_id, person_id, visit_concept_id, visit_start_date, visit_start_datetime,
        visit_end_date, visit_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                            ).render(project=self.project_id,
                                                     dataset=self.dataset_id,
                                                     visit=TABLES[5])
        load_data_queries.append(visit_query)

        # Create tables
        fq_table_names = []
        for table_name in TABLES:
            fq_table_names.append(
                f'{self.project_id}.{self.dataset_id}.{table_name}')
        bq.create_tables(self.client,
                         self.project_id,
                         fq_table_names,
                         exists_ok=True)

        # Load queries
        for query in load_data_queries:
            response = self.client.query(query)
            self.assertIsNotNone(response.result())
            self.assertIsNone(response.exception())

        # Store query for checking number of rows to delete
        for ehr in self.deactivated_participants:
            pid = ehr[0]
            for row in retraction_info.itertuples(index=False):
                if row.date_column is None:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_END_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        end_date_column=row.end_date_column,
                        start_date_column=row.start_date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                else:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        date_column=row.date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                dropped_row_count_queries.append({
                    clean_consts.QUERY:
                    dropped_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                kept_row_count_queries.append({
                    clean_consts.QUERY:
                    kept_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                sandbox_row_count_queries.append({
                    clean_consts.QUERY:
                    sandbox_query,
                    clean_consts.DESTINATION_DATASET:
                    self.sandbox_id,
                    clean_consts.DESTINATION_TABLE:
                    self.tablename
                })

        # Use query results to count number of expected dropped row deletions
        expected_dropped_row_count = {}
        for query_dict in dropped_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict[
                    'destination_table_id'] in expected_dropped_row_count:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] += result.total_rows
            else:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] = result.total_rows

        # Separate check to find number of actual deleted rows
        q = rdpt.TABLE_ROWS_QUERY.format(dataset_id=self.dataset_id)
        q_result = self.client.query(q)
        row_count_before_retraction = {}
        for row in q_result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # Use query results to count number of expected dropped row deletions
        expected_kept_row_count = {}
        for query_dict in kept_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_kept_row_count:
                expected_kept_row_count[
                    query_dict['destination_table_id']] -= (
                        (row_count_before_retraction[
                            query_dict['destination_table_id']] -
                         result.total_rows))
            else:
                expected_kept_row_count[query_dict['destination_table_id']] = (
                    row_count_before_retraction[
                        query_dict['destination_table_id']] -
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))

        # Perform retraction
        query_list = red.remove_ehr_data_queries(self.project_id,
                                                 self.ticket_number,
                                                 self.project_id,
                                                 self.dataset_id,
                                                 self.tablename)
        rdp.run_queries(query_list, self.client)

        # Find actual deleted rows
        q_result = self.client.query(q)
        results = q_result.result()
        row_count_after_retraction = {}
        for row in results:
            row_count_after_retraction[row['table_id']] = row['row_count']

        for table in expected_dropped_row_count:
            self.assertEqual(
                expected_dropped_row_count[table],
                row_count_before_retraction[table] -
                row_count_after_retraction[table])

        for table in expected_kept_row_count:
            self.assertEqual(expected_kept_row_count[table],
                             row_count_after_retraction[table])