Example 1
    def test_merge_with_unmatched_schema(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'), 'measurement.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'measurement')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'), 'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(len(incomplete_jobs), 0, 'loading tables {},{} timed out'.format('nyc_measurement', 'pitt_person'))

        table_names = ['nyc_measurement', 'pitt_person']
        success, error = bq_utils.merge_tables(
          bq_utils.get_dataset_id(),
          table_names,
          bq_utils.get_dataset_id(),
          'merged_nyc_pitt'
        )
        self.assertFalse(success)
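For contrast, a minimal happy-path sketch: the (success, error) return shape and every call below come from the example above, but it assumes person.csv has first been loaded for both HPOs so the two source tables share a schema; the successful outcome is an assumption, not something the test asserts.

# Sketch: matched-schema counterpart of the test above (hypothetical outcome).
# Assumes nyc_person and pitt_person were loaded from person.csv beforehand.
table_names = ['nyc_person', 'pitt_person']
success, error = bq_utils.merge_tables(
    bq_utils.get_dataset_id(),
    table_names,
    bq_utils.get_dataset_id(),
    'merged_nyc_pitt_person'
)
assert success, error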
Example 2
 def test_query_result(self):
     sc_bucket = self.client.get_bucket(self.hpo_bucket)
     bucket_blob = sc_bucket.blob('person.csv')
     with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
         bucket_blob.upload_from_file(fp)
     result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON)
     load_job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'loading table {} timed out'.format(common.PERSON))
     table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
     q = 'SELECT person_id FROM %s' % table_id
     result = bq_utils.query(q)
     self.assertEqual(5, int(result['totalRows']))
Example 3
    def test_query_result(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)

        load_job_id = result['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(PERSON))

        table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
        q = 'SELECT person_id FROM %s' % table_id
        result = bq_utils.query(q)
        self.assertEqual(5, int(result['totalRows']))
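Beyond totalRows, the response also carries the selected values in the REST shape row['f'][i]['v'] used in Example 9 below; a minimal sketch of flattening them, assuming that same response shape:

# Sketch: pull the selected person_id values out of the raw query response.
result = bq_utils.query('SELECT person_id FROM %s' % table_id)
person_ids = [int(row['f'][0]['v']) for row in result.get('rows', [])]
assert len(person_ids) == int(result['totalRows'])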
Example 4
    def _load_dataset(self, hpo_id):
        for cdm_table in resources.CDM_TABLES:

            cdm_filename: str = f'{cdm_table}.csv'
            cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_filename)

            bucket = self.storage_client.get_bucket(self.hpo_bucket)
            cdm_blob = bucket.blob(cdm_filename)
            if os.path.exists(cdm_filepath):
                cdm_blob.upload_from_filename(cdm_filepath)
            else:
                cdm_blob.upload_from_string('dummy\n')

            bq_utils.load_cdm_csv(hpo_id, cdm_table)

        # ensure concept table exists
        if not bq_utils.table_exists(common.CONCEPT):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
            q = """INSERT INTO {dataset}.concept
            SELECT * FROM {vocab}.concept""".format(
                dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
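A hedged sanity check in the same spirit, assuming get_table_info reports numRows as in Example 6 and that the concept table id is simply common.CONCEPT:

# Sketch: confirm the concept table was actually populated by the INSERT above.
concept_info = bq_utils.get_table_info(common.CONCEPT)
assert int(concept_info.get('numRows', 0)) > 0, 'concept table is empty'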
Example 5
 def _load_datasets(self):
     load_jobs = []
     for cdm_table in common.CDM_TABLES:
         cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                      cdm_table + '.csv')
         if os.path.exists(cdm_file_name):
             test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
             test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
         else:
             test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                       'dummy\n')
             test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                       'dummy\n')
         chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
         pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
         chs_load_job_id = chs_load_results['jobReference']['jobId']
         pitt_load_job_id = pitt_load_results['jobReference']['jobId']
         load_jobs.append(chs_load_job_id)
         load_jobs.append(pitt_load_job_id)
     incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
     if len(incomplete_jobs) > 0:
         raise RuntimeError('BigQuery jobs %s failed to complete' %
                            incomplete_jobs)
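wait_on_jobs only reports jobs that did not finish in time; as Example 9 shows, a completed load job can still carry errors in its status. A hedged per-job check that could follow the wait, reusing get_job_details from that example:

# Sketch: verify each completed load job finished without errors.
for job_id in load_jobs:
    job_status = bq_utils.get_job_details(job_id)['status']
    job_errors = job_status.get('errors')
    if job_errors:
        raise RuntimeError('load job %s reported errors: %s' % (job_id, job_errors))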
Example 6
    def test_load_cdm_csv(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
        self.assertEqual(result['status']['state'], 'RUNNING')

        load_job_id = result['jobReference']['jobId']
        table_id = result['configuration']['load']['destinationTable'][
            'tableId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_id))
        table_info = bq_utils.get_table_info(table_id)
        num_rows = table_info.get('numRows')
        self.assertEqual(num_rows, '5')
Example 7
    def test_load_cdm_csv(self):
        with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
        result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
        self.assertEqual(result['status']['state'], 'RUNNING')

        load_job_id = result['jobReference']['jobId']
        table_id = result['configuration']['load']['destinationTable'][
            'tableId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_id))
        query_response = bq_utils.query('SELECT 1 FROM %(table_id)s' %
                                        locals())
        self.assertEqual(query_response['totalRows'], '5')
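Examples 6 and 7 verify the same load two different ways (table metadata vs. a query); a hedged sketch that cross-checks one against the other, reusing only the calls shown above:

# Sketch: metadata row count and queried row count should agree (both are strings here).
table_info = bq_utils.get_table_info(table_id)
query_response = bq_utils.query('SELECT 1 FROM %s' % table_id)
assert table_info.get('numRows') == query_response['totalRows'] == '5'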
Example 8
    def _load_datasets(self):
        """
        Load five persons data for each test hpo
        # expected_tables is for testing output
        # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables = dict()
        running_jobs = []
        for cdm_table in resources.CDM_TABLES:
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                if hpo_id == NYC_HPO_ID:
                    cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_table + '.csv')
                else:
                    cdm_file_name = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources.csv_to_list(cdm_file_name)
                else:
                    # results in empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected
        output_table_person = ehr_union.output_table_for(
            combine_ehr_rdr.PERSON_TABLE)
        output_table_observation = ehr_union.output_table_for(
            combine_ehr_rdr.OBSERVATION_TABLE)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example 9
 def test_load_ehr_observation(self):
     hpo_id = 'pitt'
     dataset_id = bq_utils.get_dataset_id()
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id,
         table_id=table_id)
     expected_observation_ids = [int(row['observation_id'])
                                 for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)]
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id), 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0, 'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors, msg='pitt_observation load job failed: ' + str(load_job_errors))
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [int(row['f'][0]['v']) for row in query_results_response['rows']]
     self.assertListEqual(actual_result, expected_observation_ids)
Example 10
 def test_load_cdm_csv_error_on_bad_table_name(self):
     with self.assertRaises(ValueError) as cm:
         bq_utils.load_cdm_csv(FAKE_HPO_ID, 'not_a_cdm_table')
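The ValueError above suggests load_cdm_csv only accepts recognized CDM table names; a hypothetical guard written against the CDM table list used in Examples 4 and 5 (the wrapper name and message are illustrative, not part of the library):

# Hypothetical wrapper: validate the table name before kicking off a load job.
def checked_load_cdm_csv(hpo_id, table_name):
    if table_name not in resources.CDM_TABLES:
        raise ValueError('{} is not a CDM table'.format(table_name))
    return bq_utils.load_cdm_csv(hpo_id, table_name)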
Example 11
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [{
            'id':
            self.project_id + ':' + self.bq_dataset_id
        }]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq_utils.create_table(self.pid_table_id,
                              retract_data_bq.PID_TABLE_FIELDS,
                              drop_existing=True,
                              dataset_id=self.bq_dataset_id)
        bq_formatted_insert_values = ', '.join([
            '(%s, %s)' % (person_id, research_id)
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        bq_utils.query(q)

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            retract_data_bq.logger.info('Preparing to load table %s.%s' %
                                        (self.bq_dataset_id, hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                        cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id,
                                           cdm_table,
                                           dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.info('Loading table %s.%s' %
                                        (self.bq_dataset_id, hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0,
                         'NYC five person load job did not complete')
        retract_data_bq.logger.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # perform retraction
        retract_data_bq.run_bq_retraction(self.test_project_id,
                                          self.bq_dataset_id,
                                          self.test_project_id,
                                          self.pid_table_id, self.hpo_id,
                                          self.dataset_ids)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table], row_count_before_retraction[table] -
                row_count_after_retraction[table])
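The before/after row counts above are each collected with the same loop; a compact hedged form of that step, assuming response2rows yields dicts keyed by table_id and row_count as the example implies:

# Sketch: collapse the count-collection loop into a dict comprehension.
rows = bq_utils.response2rows(bq_utils.query(q))
row_counts = {row['table_id']: row['row_count'] for row in rows}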
Example 12
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    logging.info(' Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]

            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'
                            .format(bucket=bucket,
                                    folder_prefix=folder_prefix,
                                    cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name,
                                             load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (
                        bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)

            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE)
                    for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)

        logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                     (bucket, folder_prefix))
        _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
    def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_retraction_info):
        d = {
            'project_id': [
                self.project_id, self.project_id, self.project_id,
                self.project_id, self.project_id, self.project_id
            ],
            'dataset_id': [
                self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id,
                self.bq_dataset_id, self.bq_dataset_id, self.bq_dataset_id
            ],
            'table': [
                'fake_condition_occurrence', 'fake_drug_exposure',
                'fake_measurement', 'fake_observation',
                'fake_procedure_occurrence', 'fake_visit_occurrence'
            ],
            'date_column': [
                None, None, 'measurement_date', 'observation_date',
                'procedure_date', None
            ],
            'start_date_column': [
                'condition_start_date', 'drug_exposure_start_date', None, None,
                None, 'visit_start_date'
            ],
            'end_date_column': [
                'condition_end_date', 'drug_exposure_end_date', None, None,
                None, 'visit_end_date'
            ]
        }
        retraction_info = pd.DataFrame(data=d)
        mock_retraction_info.return_value = retraction_info

        # Create and load person_ids and deactivated_date to pid table
        bq.create_tables(self.client,
                         self.project_id,
                         self.pid_table_id_list,
                         exists_ok=False,
                         fields=retract_deactivated_pids.PID_TABLE_FIELDS)
        bq_formatted_insert_values = ', '.join([
            '(%s, "%s")' % (person_id, deactivated_date)
            for (person_id,
                 deactivated_date) in self.deactivated_ehr_participants
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        self.client.query(q)

        job_ids = []
        dropped_row_count_queries = []
        kept_row_count_queries = []
        hpo_table_list = []

        # Load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # Do not process if person table
            if hpo_table == 'fake_person':
                continue
            hpo_table_list.append(hpo_table)
            logging.info(
                f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                        cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id,
                                           cdm_table,
                                           dataset_id=self.bq_dataset_id)
            logging.info(f'Loading table {self.bq_dataset_id}.{hpo_table}')
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)

        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0,
                         'NYC five person load job did not complete')
        logging.info('All tables loaded successfully')

        # Store query for checking number of rows to delete
        for ehr in self.deactivated_ehr_participants:
            pid = ehr[0]
            for row in retraction_info.itertuples(index=False):
                if row.date_column is None:
                    dropped_query = EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    kept_query = EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                else:
                    dropped_query = EXPECTED_DROPPED_ROWS_QUERY.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        date_column=row.date_column)
                    kept_query = EXPECTED_KEPT_ROWS_QUERY.format(
                        dataset_id=self.bq_dataset_id,
                        table_id=row.table,
                        pid_table_id=self.pid_table_id,
                        pid=pid,
                        date_column=row.date_column)
                dropped_row_count_queries.append({
                    clean_consts.QUERY: dropped_query,
                    clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                    clean_consts.DESTINATION_TABLE: row.table
                })
                kept_row_count_queries.append({
                    clean_consts.QUERY: kept_query,
                    clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                    clean_consts.DESTINATION_TABLE: row.table
                })

        # Use query results to count number of expected dropped row deletions
        expected_dropped_row_count = {}
        for query_dict in dropped_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_dropped_row_count:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] += result.total_rows
            else:
                expected_dropped_row_count[
                    query_dict['destination_table_id']] = result.total_rows

        # Separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = self.client.query(q)
        row_count_before_retraction = {}
        for row in q_result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # Use query results to count number of expected dropped row deletions
        expected_kept_row_count = {}
        for query_dict in kept_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            if query_dict['destination_table_id'] in expected_kept_row_count:
                expected_kept_row_count[query_dict['destination_table_id']] -= (
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))
            else:
                expected_kept_row_count[query_dict['destination_table_id']] = (
                    row_count_before_retraction[
                        query_dict['destination_table_id']] -
                    (row_count_before_retraction[
                        query_dict['destination_table_id']] -
                     result.total_rows))

        # Perform retraction
        query_list = retract_deactivated_pids.create_queries(
            self.project_id, self.ticket_number, self.project_id,
            self.bq_dataset_id, self.pid_table_id)
        retract_deactivated_pids.run_queries(query_list, self.client)

        # Find actual deleted rows
        q_result = self.client.query(q)
        results = q_result.result()
        row_count_after_retraction = {}
        for row in results:
            row_count_after_retraction[row['table_id']] = row['row_count']

        for table in expected_dropped_row_count:
            self.assertEqual(
                expected_dropped_row_count[table],
                row_count_before_retraction[table] -
                row_count_after_retraction[table])

        for table in expected_kept_row_count:
            self.assertEqual(expected_kept_row_count[table],
                             row_count_after_retraction[table])