def get_sandbox_queries(project_id, dataset_id, pids, ticket_number):
    """
    Returns a list of queries for all tables to be added to the dataset's sandbox.
    These tables include all rows from all affected tables that contain PIDs
    that will be removed by a specific cleaning rule.

    :param project_id: bq project_id
    :param dataset_id: bq dataset_id
    :param pids: list of person_ids from the cleaning rule that need to be sandboxed and removed
    :param ticket_number: jira ticket number that will be appended to the end of the sandbox table names
    :return: list of CREATE OR REPLACE queries to create tables in the sandbox
    """
    person_tables_list = get_tables_with_person_id(project_id, dataset_id)
    queries_list = []
    for table in person_tables_list:
        sandbox_queries = dict()
        sandbox_queries[cdr_consts.QUERY] = SANDBOX_QUERY.format(
            dataset=dataset_id,
            project=project_id,
            table=table,
            sandbox_dataset=get_sandbox_dataset_id(dataset_id),
            intermediary_table=table + '_' + ticket_number,
            # need to convert list of pids to string of pids
            pids=','.join([str(i) for i in pids]))
        queries_list.append(sandbox_queries)
    return queries_list
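# For context, SANDBOX_QUERY is a module-level template formatted once per
# affected table. A minimal sketch of its likely shape is shown below; the
# placeholder names match the format() call above, but the exact SQL (SELECT *
# and the person_id filter) is an assumption, not confirmed by this excerpt.
SANDBOX_QUERY = """
CREATE OR REPLACE TABLE `{project}.{sandbox_dataset}.{intermediary_table}` AS
SELECT *
FROM `{project}.{dataset}.{table}`
WHERE person_id IN ({pids})
"""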
def main(project_id: str, bucket_name: str, dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: identifies the BQ project to load the vocabulary into
    :param bucket_name: refers to the bucket containing vocabulary files
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    sandbox_dataset_id = get_sandbox_dataset_id(dst_dataset_id)
    sandbox_dataset = bq.create_dataset(
        project_id,
        sandbox_dataset_id,
        f'Vocabulary loaded from gs://{bucket_name}',
        label_or_tag={'type': 'vocabulary'},
        overwrite_existing=True)
    stage_jobs = load_stage(sandbox_dataset, bq_client, bucket_name, gcs_client)
    wait_jobs(stage_jobs)
    load_jobs = load(project_id,
                     bq_client,
                     sandbox_dataset_id,
                     dst_dataset_id,
                     overwrite_ok=True)
    wait_jobs(load_jobs)
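# A minimal sketch of how this entry point might be invoked from the command
# line. The argument names mirror main()'s parameters; the argparse wrapper
# itself is an assumption and not part of the original module.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Load vocabulary files from GCS into BigQuery')
    parser.add_argument('--project_id', required=True)
    parser.add_argument('--bucket_name', required=True)
    parser.add_argument('--dst_dataset_id', required=True)
    args = parser.parse_args()
    main(args.project_id, args.bucket_name, args.dst_dataset_id)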
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_rdr_dataset_id()
    self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    sandbox.check_and_create_sandbox_dataset(self.project_id, self.dataset_id)
def get_remove_operational_pii_fields_query(project_id, dataset_id,
                                            sandbox_dataset_id):
    """
    Returns queries that sandbox and then delete operational PII records from the observation table

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Name of the sandbox dataset
    :return: list of query dicts
    """

    # fetch sandbox_dataset_id
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    load_operational_pii_fields_lookup_table(
        project_id=project_id, sandbox_dataset_id=sandbox_dataset_id)

    queries_list = []

    # Save operational pii records being deleted in sandbox `dataset.intermediary_table`
    query = dict()
    query[cdr_consts.QUERY] = _get_intermediary_table_query(
        dataset_id, project_id, sandbox_dataset_id)
    queries_list.append(query)

    # Delete operational pii records from observation table
    query = dict()
    query[cdr_consts.QUERY] = _get_delete_query(dataset_id, project_id,
                                                sandbox_dataset_id)
    queries_list.append(query)

    return queries_list
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    # Removing any existing datasets that might interfere with the test
    self.client = get_client(self.project_id)
    self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                               delete_contents=True,
                               not_found_ok=True)
def get_remove_invalid_procedure_source_queries(project_id,
                                                dataset_id,
                                                sandbox_dataset_id=None):
    """
    Returns queries that remove records containing invalid values in the
    procedure_source_concept_id field.

    A procedure_source_concept_id is invalid when it is not in the procedure
    domain and the corresponding procedure_concept_id is not a standard
    concept in the procedure domain.

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows
    #TODO use sandbox_dataset_id for CR
    :return: list of query dicts
    """
    queries_list = []

    # queries to sandbox
    invalid_records = dict()
    invalid_records[
        cdr_consts.QUERY] = INVALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(
            project=project_id,
            dataset=dataset_id,
            table=TABLE,
            sandbox_dataset=get_sandbox_dataset_id(dataset_id),
            intermediary_table=INTERMEDIARY_TABLE_NAME)
    queries_list.append(invalid_records)

    # queries to delete invalid procedure source records
    valid_records = VALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(
        project=project_id,
        dataset=dataset_id,
        table=TABLE,
        sandbox_dataset=get_sandbox_dataset_id(dataset_id),
        intermediary_table=INTERMEDIARY_TABLE_NAME)
    queries_list.append({
        clean_consts.QUERY: valid_records,
        clean_consts.DESTINATION_TABLE: TABLE,
        clean_consts.DESTINATION_DATASET: dataset_id,
        clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE
    })

    return queries_list
def setUp(self):
    self.project_id = app_identity.get_application_id()
    if 'test' not in self.project_id:
        raise RuntimeError(
            f"Make sure the project_id is set to test. Project_id is {self.project_id}"
        )
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.client = bq.get_client(self.project_id)
    self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
    self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
    self.setup_data()
def get_queries_clean_smoking(project_id, dataset_id, sandbox_dataset_id):
    """
    Queries to run for deleting incorrect smoking rows and inserting corrected rows

    :param project_id: project id associated with the dataset to run the queries on
    :param dataset_id: dataset id to run the queries on
    :param sandbox_dataset_id: dataset id of the sandbox
    :return: list of query dicts
    """
    queries = []

    # fetch sandbox_dataset_id
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    load_smoking_lookup_table(project_id, sandbox_dataset_id)

    sandbox_query = dict()
    sandbox_query[cdr_consts.QUERY] = SANDBOX_CREATE_QUERY.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_smoking_rows=NEW_SMOKING_ROWS,
        smoking_lookup_table=SMOKING_LOOKUP_TABLE)
    queries.append(sandbox_query)

    delete_query = dict()
    delete_query[cdr_consts.QUERY] = DELETE_INCORRECT_RECORDS.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        smoking_lookup_table=SMOKING_LOOKUP_TABLE)
    queries.append(delete_query)

    insert_query = dict()
    insert_query[cdr_consts.QUERY] = INSERT_CORRECTED_RECORDS.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_smoking_rows=NEW_SMOKING_ROWS)
    queries.append(insert_query)

    return queries
def setUpClass(cls):
    print('**************************************************************')
    print(cls.__name__)
    print('**************************************************************')
    super().initialize_class_vars()
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_rdr_dataset_id()
    sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)
    rule = PpiBranching(project_id, dataset_id, sandbox_dataset_id)
    cls.dataset_id = dataset_id
    cls.sandbox_dataset_id = sandbox_dataset_id
    cls.project_id = project_id
    cls.rule_instance = rule
    cls.fq_sandbox_table_names = [
        _fq_table_name(table)
        for table in (rule.lookup_table, rule.backup_table)
    ]
    cls.fq_table_names = [_fq_table_name(rule.observation_table)]
    super().setUpClass()
def delete_sandbox(self):
    sandbox_dataset_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    self.client.delete_dataset(sandbox_dataset_id,
                               delete_contents=True,
                               not_found_ok=True)
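# Every snippet above relies on get_sandbox_dataset_id to derive the sandbox
# dataset name from a source dataset. A minimal sketch of that helper is shown
# below, assuming the convention is simply appending a fixed suffix to the
# dataset id; the '_sandbox' suffix is an assumption, not confirmed here.
def get_sandbox_dataset_id(dataset_id: str) -> str:
    sandbox_suffix = 'sandbox'
    return f'{dataset_id}_{sandbox_suffix}'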