def setup_rule(self, client=None):
    """
    Load required resources prior to executing cleaning rule queries.

    Method to run data upload options before executing the first cleaning
    rule of a class.  For example, if your class requires loading a static
    table, that load operation should be defined here.  It SHOULD NOT BE
    defined as part of get_query_specs().

    :param client: a BigQuery client object
    :return: None
    :raises: BadRequest, OSError, AttributeError, TypeError, ValueError if
        the load job fails.  Error raised from the
        bq.upload_csv_data_to_bq_table helper function.
    """
    # create the _unit_mapping table
    unit_mapping_table = (f'{self.project_id}.'
                          f'{self.sandbox_dataset_id}.'
                          f'{UNIT_MAPPING_TABLE}')
    bq.create_tables(
        client,
        self.project_id,
        [unit_mapping_table],
    )

    # upload data to the _unit_mapping table
    unit_mappings_csv_path = os.path.join(resources.resource_files_path,
                                          UNIT_MAPPING_FILE)

    result = bq.upload_csv_data_to_bq_table(client, self.sandbox_dataset_id,
                                            UNIT_MAPPING_TABLE,
                                            unit_mappings_csv_path,
                                            UNIT_MAPPING_TABLE_DISPOSITION)

    LOGGER.info(f"Created {self.sandbox_dataset_id}.{UNIT_MAPPING_TABLE} and "
                f"loaded data from {unit_mappings_csv_path}")
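# Hedged sketch: how a cleaning rule's lifecycle hooks might be driven.  It
# assumes a `rule` object exposing setup_rule() and get_query_specs() as in
# the methods in this section, and that each query spec carries its SQL under
# the 'query' key (the value of clean_consts.QUERY elsewhere in this
# section).  The real clean_engine driver may differ.
from google.cloud import bigquery


def apply_rule(project_id, rule):
    client = bigquery.Client(project=project_id)
    # one-time loads (static lookup tables, etc.) run before any rule queries
    rule.setup_rule(client)
    for spec in rule.get_query_specs():
        # execute specs serially so later queries can rely on earlier ones
        client.query(spec['query']).result()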
def setup_rule(self, client, *args, **keyword_args):
    # create _logging_standard_concept_id_replacement
    fq_table_names = [
        f'{self.project_id}.{self.sandbox_dataset_id}.{SRC_CONCEPT_ID_TABLE_NAME}'
    ]
    create_tables(client, self.project_id, fq_table_names, exists_ok=True)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
    self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
    self.table_ids = [self.fq_person_ext, self.fq_person]
    self.person = gbq.Table.from_string(self.fq_person)
    self.person_ext = gbq.Table.from_string(self.fq_person_ext)
    self.client = bq.get_client(self.project_id)

    # drop any existing tables so each run starts from a clean slate
    self.tearDown()

    for fq_table in self.table_ids:
        table_id = fq_table.split('.')[2]
        if table_id == 'person':
            bq_utils.create_standard_table(table_id,
                                           table_id,
                                           drop_existing=True,
                                           dataset_id=self.dataset_id)
        else:
            bq.create_tables(self.client,
                             self.project_id, [fq_table],
                             exists_ok=True)
    self.populate_tables()
def setup_rule(self, client=None):
    """
    Load required resources prior to executing cleaning rule queries.

    Method to run data upload options before executing the first cleaning
    rule of a class.  For example, if your class requires loading a static
    table, that load operation should be defined here.  It SHOULD NOT BE
    defined as part of get_query_specs().

    :param client: a BigQuery client object
    :return: None
    """
    # create the _unit_mapping table
    unit_mapping_table = f'{self.project_id}.{self.dataset_id}.{UNIT_MAPPING_TABLE}'
    bq.create_tables(
        client,
        self.project_id,
        [unit_mapping_table],
    )

    # upload data to the _unit_mapping table
    unit_mappings_csv_path = os.path.join(resources.resource_files_path,
                                          UNIT_MAPPING_FILE)

    job = bq.upload_csv_data_to_bq_table(client, self.dataset_id,
                                         UNIT_MAPPING_TABLE,
                                         unit_mappings_csv_path,
                                         UNIT_MAPPING_TABLE_DISPOSITION)
    # block until the load job completes
    job.result()

    LOGGER.info(f"Created {self.dataset_id}.{UNIT_MAPPING_TABLE} and "
                f"loaded data from {unit_mappings_csv_path}")
def setUpClass(cls):
    # get the test project
    if 'test' not in cls.project_id:
        raise RuntimeError(f'Tests should only run in a test environment. '
                           f'Current environment is {cls.project_id}.')

    if not cls.fq_table_names:
        raise RuntimeError('Provide a list of fully qualified table names '
                           'the test will manipulate.')

    cls.client = bq.get_client(cls.project_id)

    # get or create datasets; cleaning rules can assume the datasets exist
    required_datasets = []
    for table_name in cls.fq_table_names + cls.fq_sandbox_table_names:
        dataset_id = table_name.split('.')[1]
        required_datasets.append(dataset_id)

    desc = (f"dataset created by {cls.__name__} to test a "
            f"cleaning rule.  deletion candidate.")
    for dataset_id in set(required_datasets):
        dataset = bq.define_dataset(cls.project_id, dataset_id, desc,
                                    {'test': ''})
        cls.client.create_dataset(dataset, exists_ok=True)

    bq.create_tables(cls.client,
                     cls.project_id,
                     cls.fq_table_names,
                     exists_ok=True)
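# Illustrative sketch of the class attributes the setUpClass hook above
# consumes.  The class, project, and dataset names here are placeholders,
# not the repo's actual test fixtures.
import unittest


class ExampleCleaningRuleTest(unittest.TestCase):
    # the hook refuses to run unless 'test' appears in the project id
    project_id = 'example-test-project'
    # every table the test manipulates, fully qualified as project.dataset.table
    fq_table_names = ['example-test-project.rule_dataset.observation']
    # sandbox tables contribute only their dataset ids for dataset creation
    fq_sandbox_table_names = ['example-test-project.rule_sandbox.observation']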
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
    self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
    self.table_ids = [self.fq_person, self.fq_person_ext]
    self.person = Table.from_string(self.fq_person)
    self.person_ext = Table.from_string(self.fq_person_ext)
    self.client = bq.get_client(self.project_id)

    # drop and recreate the tables so each run starts from a clean slate
    for table in self.table_ids:
        self.client.delete_table(table, not_found_ok=True)
    bq.create_tables(self.client, self.project_id, self.table_ids)
    self.populate_tables()
def get_domain_mapping_queries(project_id, dataset_id):
    """
    Generate a list of query dicts for creating id mappings in
    _logging_domain_alignment.  The list is consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in
        _logging_domain_alignment
    """
    # create an empty _logging_domain_alignment table
    client = bq.get_client(project_id)
    table_id = f'{project_id}.{dataset_id}.{DOMAIN_ALIGNMENT_TABLE_NAME}'
    client.delete_table(table_id, not_found_ok=True)
    bq.create_tables(client, project_id, [table_id], exists_ok=False)

    domain_mapping_queries = []

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        domain_mapping_queries.append(query)

    # create the query for field_mappings for records moving within the
    # same domain
    query = parse_domain_mapping_query_for_same_domains(project_id, dataset_id)
    domain_mapping_queries.append(query)

    # create the query for records that are in the wrong domain but will
    # not be moved
    query = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    domain_mapping_queries.append(query)

    unioned_query = {
        cdr_consts.QUERY: UNION_ALL.join(domain_mapping_queries),
        cdr_consts.DESTINATION_TABLE: DOMAIN_ALIGNMENT_TABLE_NAME,
        cdr_consts.DISPOSITION: bq_consts.WRITE_EMPTY,
        cdr_consts.DESTINATION_DATASET: dataset_id
    }

    return [unioned_query]
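# Hedged sketch: executing one of the returned query dicts with the
# google-cloud-bigquery client.  Key names follow the dict literal in
# get_domain_mapping_queries; clean_engine's actual runner may differ.
from google.cloud import bigquery


def run_query_dict(client, project_id, spec):
    dest = bigquery.TableReference.from_string(
        f'{project_id}.{spec[cdr_consts.DESTINATION_DATASET]}.'
        f'{spec[cdr_consts.DESTINATION_TABLE]}')
    job_config = bigquery.QueryJobConfig(
        destination=dest,
        # WRITE_EMPTY fails the job if the destination already has rows
        write_disposition=spec[cdr_consts.DISPOSITION])
    client.query(spec[cdr_consts.QUERY], job_config=job_config).result()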
def setup_data(self):
    self.tearDown()

    # set up the deactivated participants table
    deact_table_ref = gbq.TableReference.from_string(self.deact_table)
    bq.create_tables(self.client,
                     self.project_id, [self.deact_table],
                     exists_ok=True)
    job_config = gbq.QueryJobConfig()
    job = self.client.query(
        DEACTIVATED_PIDS.render(deact_table=deact_table_ref), job_config)
    job.result()

    # create omop tables and mapping/ext tables
    for table in self.tables:
        fq_table = f'{self.project_id}.{self.dataset_id}.{table}'
        bq.create_tables(self.client,
                         self.project_id, [fq_table],
                         exists_ok=True)
        table_ref = gbq.TableReference.from_string(fq_table)
        job_config = gbq.QueryJobConfig()
        job = self.client.query(self.tables[table].render(table=table_ref),
                                job_config)
        job.result()
def setUp(self):
    bq.create_tables(self.client,
                     self.project_id,
                     self.fq_table_names,
                     exists_ok=True)
def check_and_copy_tables(project_id, dataset_id):
    """
    Check that all the required tables exist; any that do not are created
    or copied from another table.

    :param project_id: Project where the dataset resides
    :param dataset_id: Dataset where the required lab table needs to be
        created
    :return: None
    """
    descendants_table_name = f'{project_id}.{dataset_id}.{MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE}'
    concept_sets_table_name = f'{project_id}.{dataset_id}.{MEASUREMENT_CONCEPT_SETS_TABLE}'
    client = bq.get_client(project_id)
    dataset = client.dataset(dataset_id)
    vocab_dataset = client.dataset(common.VOCABULARY_DATASET)

    # concept table and concept_ancestor table source tables
    concept_source_table = vocab_dataset.table(common.CONCEPT)
    concept_ancestor_source_table = vocab_dataset.table(
        common.CONCEPT_ANCESTOR)

    # concept table and concept_ancestor table destination tables
    concept_dest_table = dataset.table(common.CONCEPT)
    concept_ancestor_dest_table = dataset.table(common.CONCEPT_ANCESTOR)

    # query checks the row counts of each table in the specified dataset
    row_count_query = ROW_COUNT_QUERY.render(project_id=project_id,
                                             dataset_id=dataset_id)
    results_dataframe = client.query(row_count_query).to_dataframe()
    empty_results_dataframe = results_dataframe[(
        results_dataframe['row_count'] == 0)]

    # if the CONCEPT and CONCEPT_ANCESTOR tables do not exist, copy them
    # from the tables in common.VOCABULARY
    if common.CONCEPT not in (results_dataframe['table_id']).values:
        client.copy_table(concept_source_table, concept_dest_table)

    if common.CONCEPT_ANCESTOR not in (results_dataframe['table_id']).values:
        client.copy_table(concept_ancestor_source_table,
                          concept_ancestor_dest_table)

    # if the CONCEPT and CONCEPT_ANCESTOR tables are empty, copy them from
    # the tables in common.VOCABULARY
    if common.CONCEPT in (empty_results_dataframe['table_id']).values:
        client.copy_table(concept_source_table, concept_dest_table)

    if common.CONCEPT_ANCESTOR in (empty_results_dataframe['table_id']).values:
        client.copy_table(concept_ancestor_source_table,
                          concept_ancestor_dest_table)

    # if MEASUREMENT_CONCEPT_SETS_TABLE and
    # MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE do not exist, create them
    if MEASUREMENT_CONCEPT_SETS_TABLE not in results_dataframe[
            'table_id'].values:
        bq.create_tables(client=client,
                         project_id=project_id,
                         fq_table_names=[concept_sets_table_name],
                         exists_ok=True,
                         fields=None)

    if MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE not in results_dataframe[
            'table_id'].values:
        bq.create_tables(client=client,
                         project_id=project_id,
                         fq_table_names=[descendants_table_name],
                         exists_ok=True,
                         fields=None)
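# ROW_COUNT_QUERY is referenced above but not defined in this section.  A
# plausible Jinja template (an assumption, not the repo's actual definition)
# reads the dataset's __TABLES__ metadata, which exposes exactly the
# table_id and row_count columns the dataframe checks rely on:
ROW_COUNT_QUERY_SKETCH = """
SELECT table_id, row_count
FROM `{{project_id}}.{{dataset_id}}.__TABLES__`
"""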
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset,
        mock_is_unioned_dataset, mock_is_combined_dataset,
        mock_is_deid_dataset):
    mock_list_datasets.return_value = [self.bq_dataset_id]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq.create_tables(
        self.client,
        self.test_project_id, [
            f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
        ],
        exists_ok=False,
        fields=[rbq.PID_TABLE_FIELDS])
    bq_formatted_insert_values = ', '.join([
        f'({person_id}, {research_id})'
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    job = self.client.query(q)
    job.result()

    row_count_queries = {}
    # load the cdm files into the dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = f'{self.hpo_id}_{cdm_table}'
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        logging.info(
            f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
        with open(cdm_file, 'rb') as f:
            job_config = bigquery.LoadJobConfig()
            job_config.source_format = bigquery.SourceFormat.CSV
            job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_EMPTY'
            job_config.schema = bq.get_table_schema(cdm_table)
            load_job = self.client.load_table_from_file(
                f,
                f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                job_config=job_config)
            load_job.result()
    logging.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        job = self.client.query(row_count_queries[table])
        result = job.result()
        expected_row_count[table] = result.to_dataframe()['count'].to_list()[0]

    # separate check to find the number of actually deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_before_retraction = pd.Series(
        result.row_count.values, index=result.table_id).to_dict()

    # perform retraction
    rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                          self.test_project_id, self.pid_table_id,
                          self.hpo_id, self.dataset_ids,
                          self.retraction_type)

    # find actually deleted rows
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_after_retraction = pd.Series(
        result.row_count.values, index=result.table_id).to_dict()

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_counts_before_retraction[table] -
            row_counts_after_retraction[table])
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_retraction_info):
    d = {
        'project_id': [self.project_id] * 6,
        'dataset_id': [self.bq_dataset_id] * 6,
        'table': [
            'fake_condition_occurrence', 'fake_drug_exposure',
            'fake_measurement', 'fake_observation',
            'fake_procedure_occurrence', 'fake_visit_occurrence'
        ],
        'date_column': [
            None, None, 'measurement_date', 'observation_date',
            'procedure_date', None
        ],
        'start_date_column': [
            'condition_start_date', 'drug_exposure_start_date', None, None,
            None, 'visit_start_date'
        ],
        'end_date_column': [
            'condition_end_date', 'drug_exposure_end_date', None, None, None,
            'visit_end_date'
        ]
    }
    retraction_info = pd.DataFrame(data=d)
    mock_retraction_info.return_value = retraction_info

    # create and load person_ids and deactivated_date to pid table
    bq.create_tables(self.client,
                     self.project_id,
                     self.pid_table_id_list,
                     exists_ok=False,
                     fields=retract_deactivated_pids.PID_TABLE_FIELDS)
    bq_formatted_insert_values = ', '.join([
        f'({person_id}, "{deactivated_date}")'
        for (person_id, deactivated_date) in self.deactivated_ehr_participants
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    # wait for the insert to complete before loading dependent data
    self.client.query(q).result()

    job_ids = []
    dropped_row_count_queries = []
    kept_row_count_queries = []
    hpo_table_list = []

    # load the cdm files into the dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # do not process the person table
        if hpo_table == 'fake_person':
            continue
        hpo_table_list.append(hpo_table)
        logging.info(
            f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        logging.info(f'Loading table {self.bq_dataset_id}.{hpo_table}')
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    logging.info('All tables loaded successfully')

    # store queries for checking the number of rows to delete
    for ehr in self.deactivated_ehr_participants:
        pid = ehr[0]
        for row in retraction_info.itertuples(index=False):
            if row.date_column is None:
                dropped_query = EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                kept_query = EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
            else:
                dropped_query = EXPECTED_DROPPED_ROWS_QUERY.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    date_column=row.date_column)
                kept_query = EXPECTED_KEPT_ROWS_QUERY.format(
                    dataset_id=self.bq_dataset_id,
                    table_id=row.table,
                    pid_table_id=self.pid_table_id,
                    pid=pid,
                    date_column=row.date_column)
            dropped_row_count_queries.append({
                clean_consts.QUERY: dropped_query,
                clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            kept_row_count_queries.append({
                clean_consts.QUERY: kept_query,
                clean_consts.DESTINATION_DATASET: self.bq_dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })

    # use query results to count the number of expected dropped rows
    expected_dropped_row_count = {}
    for query_dict in dropped_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_dropped_row_count:
            expected_dropped_row_count[
                query_dict['destination_table_id']] += result.total_rows
        else:
            expected_dropped_row_count[
                query_dict['destination_table_id']] = result.total_rows

    # separate check to find the number of actually deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = self.client.query(q)
    row_count_before_retraction = {}
    for row in q_result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # use query results to count the number of expected kept rows
    expected_kept_row_count = {}
    for query_dict in kept_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_kept_row_count:
            expected_kept_row_count[query_dict['destination_table_id']] -= (
                row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows)
        else:
            expected_kept_row_count[query_dict['destination_table_id']] = (
                row_count_before_retraction[
                    query_dict['destination_table_id']] -
                (row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows))

    # perform retraction
    query_list = retract_deactivated_pids.create_queries(
        self.project_id, self.ticket_number, self.project_id,
        self.bq_dataset_id, self.pid_table_id)
    retract_deactivated_pids.run_queries(query_list, self.client)

    # find actually deleted rows
    q_result = self.client.query(q)
    results = q_result.result()
    row_count_after_retraction = {}
    for row in results:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_dropped_row_count:
        self.assertEqual(
            expected_dropped_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
    for table in expected_kept_row_count:
        self.assertEqual(expected_kept_row_count[table],
                         row_count_after_retraction[table])
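# Worked example of the kept-row bookkeeping above, with illustrative
# numbers.  Suppose a table holds 10 rows before retraction:
#   first kept query returns total_rows = 7
#     -> expected_kept = 10 - (10 - 7) = 7
#   a second kept query for the same table returns total_rows = 9
#     -> expected_kept -= (10 - 9), giving 7 - 1 = 6
# Each query subtracts its own dropped-row count from the running total, so
# the final value is the row count that should survive retraction.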
def test_remove_ehr_data_past_deactivation_date(self, mock_retraction_info,
                                                mock_get):
    # preconditions for the participant summary API module
    mock_get.return_value.status_code = 200
    mock_get.return_value.json.return_value = self.json_response_entry

    # ensure the deactivated participants table is created and/or updated
    df = psr.get_deactivated_participants(self.project_id, self.dataset_id,
                                          self.tablename, self.columns)
    psr.store_participant_data(df, self.project_id,
                               f'{self.dataset_id}.{self.tablename}')

    # preconditions for the retraction module
    d = {
        'project_id': [self.project_id] * 6,
        'dataset_id': [self.dataset_id] * 6,
        'table': [
            'condition_occurrence', 'drug_exposure', 'measurement',
            'observation', 'procedure_occurrence', 'visit_occurrence'
        ],
        'date_column': [
            None, None, 'measurement_date', 'observation_date',
            'procedure_date', None
        ],
        'start_date_column': [
            'condition_start_date', 'drug_exposure_start_date', None, None,
            None, 'visit_start_date'
        ],
        'end_date_column': [
            'condition_end_date', 'drug_exposure_end_date', None, None, None,
            'visit_end_date'
        ]
    }
    retraction_info = pandas.DataFrame(data=d)
    mock_retraction_info.return_value = retraction_info

    load_data_queries = []
    dropped_row_count_queries = []
    kept_row_count_queries = []
    sandbox_row_count_queries = []

    # queries to load the dummy data into the tables
    measurement_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{measurement}}`
        (measurement_id, person_id, measurement_concept_id, measurement_date,
         measurement_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
        project=self.project_id,
        dataset=self.dataset_id,
        measurement=TABLES[0])
    load_data_queries.append(measurement_query)

    observation_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{observation}}`
        (observation_id, person_id, observation_concept_id, observation_date,
         observation_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
        project=self.project_id,
        dataset=self.dataset_id,
        observation=TABLES[1])
    load_data_queries.append(observation_query)

    procedure_occ_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{procedure}}`
        (procedure_occurrence_id, person_id, procedure_concept_id,
         procedure_date, procedure_datetime, procedure_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             procedure=TABLES[2])
    load_data_queries.append(procedure_occ_query)

    condition_occ_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{condition}}`
        (condition_occurrence_id, person_id, condition_concept_id,
         condition_start_date, condition_start_datetime, condition_end_date,
         condition_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             condition=TABLES[3])
    load_data_queries.append(condition_occ_query)

    drug_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{drug}}`
        (drug_exposure_id, person_id, drug_concept_id,
         drug_exposure_start_date, drug_exposure_start_datetime,
         drug_exposure_end_date, drug_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             drug=TABLES[4])
    load_data_queries.append(drug_query)

    visit_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{visit}}`
        (visit_occurrence_id, person_id, visit_concept_id, visit_start_date,
         visit_start_datetime, visit_end_date, visit_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             visit=TABLES[5])
    load_data_queries.append(visit_query)

    # create tables
    fq_table_names = []
    for table_name in TABLES:
        fq_table_names.append(
            f'{self.project_id}.{self.dataset_id}.{table_name}')
    bq.create_tables(self.client,
                     self.project_id,
                     fq_table_names,
                     exists_ok=True)

    # run the load queries
    for query in load_data_queries:
        response = self.client.query(query)
        self.assertIsNotNone(response.result())
        self.assertIsNone(response.exception())

    # store queries for checking the number of rows to delete
    for ehr in self.deactivated_participants:
        pid = ehr[0]
        for row in retraction_info.itertuples(index=False):
            if row.date_column is None:
                dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                sandbox_query = rdp.SANDBOX_QUERY_END_DATE.render(
                    project=self.project_id,
                    dataset=self.dataset_id,
                    table=row.table,
                    pid=pid,
                    end_date_column=row.end_date_column,
                    start_date_column=row.start_date_column,
                    deactivated_pids_project=self.project_id,
                    deactivated_pids_dataset=self.dataset_id,
                    deactivated_pids_table=self.tablename)
            else:
                dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    date_column=row.date_column)
                kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    date_column=row.date_column)
                sandbox_query = rdp.SANDBOX_QUERY_DATE.render(
                    project=self.project_id,
                    dataset=self.dataset_id,
                    table=row.table,
                    pid=pid,
                    date_column=row.date_column,
                    deactivated_pids_project=self.project_id,
                    deactivated_pids_dataset=self.dataset_id,
                    deactivated_pids_table=self.tablename)
            dropped_row_count_queries.append({
                clean_consts.QUERY: dropped_query,
                clean_consts.DESTINATION_DATASET: self.dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            kept_row_count_queries.append({
                clean_consts.QUERY: kept_query,
                clean_consts.DESTINATION_DATASET: self.dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            sandbox_row_count_queries.append({
                clean_consts.QUERY: sandbox_query,
                clean_consts.DESTINATION_DATASET: self.sandbox_id,
                clean_consts.DESTINATION_TABLE: self.tablename
            })

    # use query results to count the number of expected dropped rows
    expected_dropped_row_count = {}
    for query_dict in dropped_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_dropped_row_count:
            expected_dropped_row_count[
                query_dict['destination_table_id']] += result.total_rows
        else:
            expected_dropped_row_count[
                query_dict['destination_table_id']] = result.total_rows

    # separate check to find the number of actually deleted rows
    q = rdpt.TABLE_ROWS_QUERY.format(dataset_id=self.dataset_id)
    q_result = self.client.query(q)
    row_count_before_retraction = {}
    for row in q_result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # use query results to count the number of expected kept rows
    expected_kept_row_count = {}
    for query_dict in kept_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_kept_row_count:
            expected_kept_row_count[query_dict['destination_table_id']] -= (
                row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows)
        else:
            expected_kept_row_count[query_dict['destination_table_id']] = (
                row_count_before_retraction[
                    query_dict['destination_table_id']] -
                (row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows))

    # perform retraction
    query_list = red.remove_ehr_data_queries(self.project_id,
                                             self.ticket_number,
                                             self.project_id, self.dataset_id,
                                             self.tablename)
    rdp.run_queries(query_list, self.client)

    # find actually deleted rows
    q_result = self.client.query(q)
    results = q_result.result()
    row_count_after_retraction = {}
    for row in results:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_dropped_row_count:
        self.assertEqual(
            expected_dropped_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
    for table in expected_kept_row_count:
        self.assertEqual(expected_kept_row_count[table],
                         row_count_after_retraction[table])