def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)

    # Removing any existing datasets that might interfere with the test
    self.client = get_client(self.project_id)
    self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                               delete_contents=True,
                               not_found_ok=True)
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    args = parse_args(args)
    query_list = create_queries(args.project_id, args.ticket_number,
                                args.pids_project_id, args.pids_dataset_id,
                                args.pids_table, args.dataset_list)
    client = bq.get_client(args.project_id)
    run_queries(query_list, client)
    LOGGER.info("Retraction complete")
def setUp(self):
    self.project_id = app_identity.get_application_id()

    # this ensures the dataset is scoped appropriately in test and also
    # can be dropped in teardown (tests should not delete env resources)
    unioned_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.dataset_id = f'{unioned_dataset_id}_bq_test'

    self.description = f'Dataset for {__name__} integration tests'
    self.label_or_tag = {'test': 'bq'}

    self.client = bq.get_client(self.project_id)
    self.dataset_ref = bigquery.dataset.DatasetReference(
        self.project_id, self.dataset_id)
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.client = bq.get_client(self.test_project_id)
    self.dataset_ids = 'all_datasets'
    self.retraction_type = 'only_ehr'
    self.person_research_ids = [(1, 6890173), (2, 858761), (1234567, 4589763)]
def run_deletion(project_id, name_substrings):
    """
    Deletes datasets from project containing any of the name_substrings

    :param project_id: identifies the project
    :param name_substrings: Identifies substrings that help identify datasets to delete
    :return:
    """
    # make the developer running this script approve the environment.
    msg = (f'This will remove datasets from the `{project_id}` '
           f'environment.\nAre you sure you want to proceed? '
           f'[Y/y/N/n]: ')
    LOGGER.info(msg)
    proceed = input(msg)
    LOGGER.info(f'User entered: "{proceed}"')

    if proceed.lower() != 'y':
        LOGGER.info(f'User requested to exit the deletion script.\n'
                    f'Exiting clean_project_datasets script now.')
        return

    LOGGER.info('Continuing with dataset deletions...')
    client = bq.get_client(project_id)

    all_datasets = [
        dataset.dataset_id for dataset in list(client.list_datasets())
    ]

    datasets_with_substrings = [
        dataset for dataset in all_datasets for substring in name_substrings
        if substring in dataset
    ]

    LOGGER.info(f'{len(datasets_with_substrings)} Datasets marked for '
                f'deletion in project `{project_id}`: ')
    for dataset in datasets_with_substrings:
        LOGGER.info(f'\t{dataset}')

    msg = (f'After reviewing datasets, proceed?\nYou will need to review '
           f'the log file if you are not printing to the console.\nThis action '
           f'cannot be reversed.\n'
           f'[Y/y/N/n]: ')
    LOGGER.info(msg)
    response = input(msg)

    if response.lower() == 'y':
        _delete_datasets(client, datasets_with_substrings)
    else:
        LOGGER.info("Proper consent was not given. Aborting deletion.")

    LOGGER.info("Dataset deletion completed.")
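# Hedged usage sketch (not from the source module): run_deletion prompts twice for
# confirmation before deleting anything. The project id and substrings below are
# placeholders; the call assumes run_deletion is in scope and default GCP credentials
# are configured.
if __name__ == '__main__':
    run_deletion('my-test-project', ['_stale_', '_tmp_'])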
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parser = get_parser()
    args = parser.parse_args(args)
    client = bq.get_client(args.project_id)
    dataset_ids = ru.get_datasets_list(args.project_id, args.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(client, args.project_id, dataset_ids, args.fq_deact_table,
                     args.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
def setUp(self):
    self.project_id = app_identity.get_application_id()
    if 'test' not in self.project_id:
        raise RuntimeError(
            f"Make sure the project_id is set to test. Project_id is {self.project_id}"
        )
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.client = bq.get_client(self.project_id)
    self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
    self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
    self.setup_data()
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids_ex=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids_ex: (optional) list that identifies HPOs not to process, by default process all
    :returns: list of tables generated successfully
    """
    client = get_client(project_id)

    logging.info('EHR union started')
    # Get all hpo_ids.
    hpo_ids = [item['hpo_id'] for item in bq_utils.get_hpo_info()]
    if hpo_ids_ex:
        hpo_ids = [hpo_id for hpo_id in hpo_ids if hpo_id not in hpo_ids_ex]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info(f'Creating {output_dataset_id}.{result_table}...')
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(f'Mapping {domain_table}...')
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id, client)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(f'Creating union of table {table_name}...')
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = common.PERSON
    logging.info(f'Mapping {domain_table}...')
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id, client)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation, one each
    # for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)

    logging.info('Completed Person to Observation')
def main(project_id,
         rdr_project_id,
         org_id=None,
         hpo_id=None,
         dataset_id=DRC_OPS):

    # Get list of hpos
    LOGGER.info('Getting hpo list...')
    if org_id:
        hpo_list = [{"hpo_id": hpo_id, "org_id": org_id}]
    else:
        hpo_list = get_hpo_info(project_id)
    LOGGER.info(hpo_list)

    for hpo in hpo_list:
        org_id = hpo['org_id']
        hpo_id = hpo['hpo_id']

        # Get participant summary data
        LOGGER.info(f'Getting participant summary data for {org_id}...')
        participant_info = get_org_participant_information(
            rdr_project_id, org_id)

        # Load schema and create ingestion time-partitioned table
        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = f'{PS_API_VALUES}_{hpo_id}'

        client = bq.get_client(project_id)
        try:
            table = client.get_table(f'{project_id}.{dataset_id}.{tablename}')
        except NotFound:
            LOGGER.info(
                f'Creating table {project_id}.{dataset_id}.{tablename}...')
            table = bigquery.Table(f'{project_id}.{dataset_id}.{tablename}',
                                   schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.HOUR)
            table = client.create_table(table)

        # Insert summary data into table
        LOGGER.info(
            f'Storing participant data for {org_id} in table {project_id}.{dataset_id}.{tablename}...'
        )
        store_participant_data(participant_info,
                               project_id,
                               f'{dataset_id}.{tablename}',
                               schema=schema)

    LOGGER.info(f'Done.')
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields which do not
    conform to a dataframe. The data is stored using a JSON file object since it is
    one of the ways BigQuery expects it.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table
    :return: returns the bq job_id for the loading of digital health data
    """
    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
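# Hedged usage sketch (placeholder names, not from the source): loading a couple of
# records with store_digital_health_status_data. The record fields below are
# illustrative only and must match the destination table's schema; assumes default
# GCP credentials and that the destination dataset already exists.
sample_records = [
    {'person_id': 1, 'wearable': 'fitbit', 'status': 'YES'},
    {'person_id': 2, 'wearable': 'appleHealthKit', 'status': 'NO'},
]
job_id = store_digital_health_status_data(
    'my-test-project', sample_records,
    'my-test-project.drc_ops.digital_health_sharing_status')
print(f'Load job submitted: {job_id}')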
def setUpClass(cls):
    print('**************************************************************')
    print(cls.__name__)
    print('**************************************************************')

    cls.project_id = os.environ.get(PROJECT_ID)
    cls.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    cls.client = get_client(cls.project_id)

    cls.hpo_id = 'fake_hpo'
    cls.org_id = 'fake_org'
    cls.ps_api_table = f'{PS_API_VALUES}_{cls.hpo_id}'

    cls.fq_table_names = [
        f'{cls.project_id}.{cls.dataset_id}.{cls.ps_api_table}'
    ]
def check_and_create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to check if the sandbox dataset exists. If it does not, it will be created.

    :param project_id: the project_id that the dataset is in
    :param dataset_id: the dataset_id to verify
    :return: the sandbox dataset_name that either exists or was created
    """
    client = get_client(project_id)
    sandbox_dataset = get_sandbox_dataset_id(dataset_id)
    dataset_objs = list(client.list_datasets(project_id))
    datasets = [d.dataset_id for d in dataset_objs]

    if sandbox_dataset not in datasets:
        create_sandbox_dataset(project_id, dataset_id)
    return sandbox_dataset
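# Hedged usage sketch (placeholder names): ensuring a sandbox dataset exists before
# sandboxing rows; assumes check_and_create_sandbox_dataset is in scope and default
# GCP credentials are configured.
sandbox_name = check_and_create_sandbox_dataset('my-test-project', 'unioned_ehr')
print(f'Sandbox dataset ready: {sandbox_name}')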
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')

    self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
    self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'

    self.table_ids = [self.fq_person, self.fq_person_ext]

    self.person = Table.from_string(self.fq_person)
    self.person_ext = Table.from_string(self.fq_person_ext)

    self.client = bq.get_client(self.project_id)

    for table in self.table_ids:
        self.client.delete_table(table, not_found_ok=True)

    bq.create_tables(self.client, self.project_id, self.table_ids)

    self.populate_tables()
def setUp(self):
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.schema = [
        SchemaField("person_id", "INT64"),
        SchemaField("first_name", "STRING"),
        SchemaField("last_name", "STRING"),
        SchemaField("algorithm", "STRING")
    ]

    self.ps_api_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable')
    ]

    self.id_match_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable'),
        dict(name='algorithm', type='string', mode='nullable')
    ]

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

    # Create and populate the ps_values site table
    schema = bq.get_table_schema(PS_API_VALUES)
    tablename = self.ps_values_table_id

    table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                  schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # Set up pipeline logging
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # Populates the validation table for the site
    identify_rdr_ehr_match(client, args.project_id, args.hpo_id, EHR_OPS)

    LOGGER.info('Done.')
def get_datasets_list(project_id, dataset_ids_list):
    """
    Returns list of dataset_ids on which to perform retraction

    Returns list of rdr, ehr, unioned, combined and deid dataset_ids and excludes
    sandbox and staging datasets

    :param project_id: identifies the project containing datasets to retract from
    :param dataset_ids_list: list of datasets to retract from. If set to 'all_datasets',
        retracts from all datasets. If set to 'none', skips retraction from BigQuery datasets
    :return: List of dataset_ids
    :raises: AttributeError if dataset_ids_list does not allow .split()
    """
    client = bq.get_client(project_id)
    all_dataset_ids = [
        dataset.dataset_id
        for dataset in list(client.list_datasets(project_id))
    ]

    if not dataset_ids_list or dataset_ids_list == [consts.NONE]:
        dataset_ids = []
        LOGGER.info(
            "No datasets specified. Defaulting to empty list. Expect bucket only retraction."
        )
    elif dataset_ids_list == [consts.ALL_DATASETS]:
        dataset_ids = all_dataset_ids
        LOGGER.info(
            f"All datasets are specified. Setting dataset_ids to all datasets in project: {project_id}"
        )
    else:
        # only consider datasets that exist in the project
        dataset_ids = [
            dataset_id for dataset_id in dataset_ids_list
            if dataset_id in all_dataset_ids
        ]
        LOGGER.info(
            f"Datasets specified and existing in project {project_id}: {dataset_ids}"
        )

    # consider datasets containing PPI/EHR data, excluding sandbox/staging datasets
    dataset_ids = [
        dataset_id for dataset_id in dataset_ids
        if get_dataset_type(dataset_id) != common.OTHER and
        not is_sandbox_dataset(dataset_id)
    ]

    LOGGER.info(f"Found datasets to retract from: {', '.join(dataset_ids)}")

    return dataset_ids
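# Hedged usage sketch (placeholder names): resolving the datasets to retract from.
# Passing ['all_datasets'] expands to every non-sandbox, non-staging dataset in the
# project; assumes get_datasets_list is in scope with default GCP credentials.
datasets_to_retract = get_datasets_list('my-test-project', ['all_datasets'])
for ds in datasets_to_retract:
    print(ds)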
def main(raw_args=None):
    """
    Truncate and store fitbit data.

    Assumes you are passing arguments either via command line or a list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for specified data_stage
    # and validate if all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)
    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b',
        fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)

    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to final dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
def load_test_data(self, df, project_id, dataset_id, table):
    """
    Add data to the tables for the rule to run on.

    :param df: a dataframe containing data to insert
    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset to load the data into
    :param table: name of the table to load the data into
    """
    client = get_client(project_id)
    schema = get_table_schema(table)
    schema = [field for field in schema if field.name in list(df.columns)]

    load_job_config = LoadJobConfig(schema=schema)
    load_job = client.load_table_from_dataframe(df,
                                                f'{dataset_id}.{table}',
                                                job_config=load_job_config)
    load_job.result()
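# Hedged usage sketch (placeholder values): building a small dataframe and loading it
# from within a test case (hence the `self.` call); assumes the named table's schema
# contains these columns.
import pandas as pd

df = pd.DataFrame({'observation_id': [1, 2], 'person_id': [101, 102]})
self.load_test_data(df, 'my-test-project', 'my_test_dataset', 'observation')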
def count_pid_rows_in_dataset(project_id, dataset_id, hpo_id, pid_source):
    """
    Returns df containing tables and counts of participant rows for pids in pids_source

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :param hpo_id: Identifies the hpo site that submitted the pids
    :param pid_source: string containing query or list containing pids
    :return: df with headers table_id, all_counts, all_ehr_counts, and map_ehr_counts
    """
    dataset_type = ru.get_dataset_type(dataset_id)
    counts_df = pd.DataFrame(columns=[
        ru_consts.TABLE_ID, consts.ALL_COUNT, consts.ALL_EHR_COUNT,
        consts.MAP_EHR_COUNT
    ])

    bq_client = bq.get_client(project_id)
    cols_query = bq.dataset_columns_query(project_id, dataset_id)
    table_df = bq_client.query(cols_query).to_dataframe()

    if dataset_type == common.COMBINED:
        query = get_combined_deid_query(project_id, dataset_id, pid_source,
                                        table_df)
    elif dataset_type == common.DEID or dataset_type == common.RELEASE:
        query = get_combined_deid_query(project_id,
                                        dataset_id,
                                        pid_source,
                                        table_df,
                                        for_deid=True)
    elif dataset_type == common.EHR:
        query = get_ehr_query(project_id, dataset_id, pid_source, hpo_id,
                              table_df)
    elif dataset_type == common.RDR:
        query = get_dataset_query(project_id,
                                  dataset_id,
                                  pid_source,
                                  table_df,
                                  for_rdr=True)
    else:
        query = get_dataset_query(project_id, dataset_id, pid_source, table_df)

    if query:
        counts_df = bq.query(query, project_id)

    # sort by count desc
    counts_df = counts_df.sort_values(by=consts.ALL_COUNT, ascending=False)
    return counts_df
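# Hedged usage sketch (placeholder names): counting rows for a handful of pids in a
# combined dataset. Per the docstring above, pid_source may be a list of participant
# ids or a query string; names below are illustrative only.
pid_counts = count_pid_rows_in_dataset('my-test-project', 'combined_dataset',
                                       'fake_hpo', [1, 2, 3])
print(pid_counts.head())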
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = app_identity.get_application_id()
    if 'test' not in self.project_id:
        raise RuntimeError(
            f"Make sure the project_id is set to test. project_id is {self.project_id}"
        )
    self.bq_dataset_id = bq_utils.get_dataset_id()
    self.bq_sandbox_dataset_id = get_sandbox_dataset_id(self.bq_dataset_id)
    self.ticket_number = 'DCXXX'
    self.pid_table_id = 'pid_table'
    self.pid_table_id_list = [
        self.project_id + '.' + self.bq_dataset_id + '.' + 'pid_table'
    ]
    self.deactivated_ehr_participants = [(1, '2010-01-01'), (2, '2010-01-01'),
                                         (5, '2010-01-01')]
    self.client = bq.get_client(self.project_id)
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    self.folder_prefix = '2019-01-01/'
    test_util.delete_all_tables(self.dataset_id)
    test_util.empty_bucket(self.hpo_bucket)
    self.client = bq.get_client(self.project_id)

    mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')
    self.mock_get_hpo_name = mock_get_hpo_name.start()
    self.mock_get_hpo_name.return_value = 'Fake HPO'
    self.addCleanup(mock_get_hpo_name.stop)

    self._load_data()
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    table_id = f'{IDENTITY_MATCH_TABLE}_{args.hpo_id}'

    # Creates hpo_site identity match table if it does not exist
    if not table_exists(table_id, DRC_OPS):
        create_drc_validation_table(client, args.project_id, table_id)

    # Populates the validation table for the site
    populate_validation_table(client, args.project_id, table_id, args.hpo_id)
def load_deid_map_table(deid_map_dataset_name, age_limit):
    # Create _deid_map table in input dataset
    project_id = app_identity.get_application_id()
    client = bq.get_client(project_id)
    deid_map_table = f'{project_id}.{deid_map_dataset_name}.{DEID_MAP_TABLE}'

    # Copy master _deid_map table records to _deid_map table
    if bq_utils.table_exists(DEID_MAP_TABLE,
                             dataset_id=PIPELINE_TABLES_DATASET):
        copy_deid_map_table(deid_map_table, project_id,
                            PIPELINE_TABLES_DATASET, deid_map_dataset_name,
                            age_limit, client)
        logging.info(
            f"copied participants younger than {age_limit} to the table {deid_map_table}"
        )
    else:
        raise RuntimeError(
            f'{DEID_MAP_TABLE} is not available in {project_id}.{PIPELINE_TABLES_DATASET}'
        )
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.sandbox_id = check_and_create_sandbox_dataset(
        self.project_id, self.dataset_id)
    self.tablename = '_deactivated_participants'
    self.ticket_number = 'DC12345'

    self.columns = ['participantId', 'suspensionStatus', 'suspensionTime']
    self.deactivated_participants = [(1, 'NO_CONTACT', '2018-12-07'),
                                     (2, 'NO_CONTACT', '2019-12-07'),
                                     (3, 'NO_CONTACT', '2017-12-07')]

    self.json_response_entry = {
        'entry': [{
            'fullUrl':
                'https//foo_project.appspot.com/rdr/v1/Participant/P1/Summary',
            'resource': {
                'participantId': 'P1',
                'suspensionStatus': 'NO_CONTACT',
                'suspensionTime': '2018-12-07T08:21:14'
            }
        }, {
            'fullUrl':
                'https//foo_project.appspot.com/rdr/v1/Participant/P2/Summary',
            'resource': {
                'participantId': 'P2',
                'suspensionStatus': 'NO_CONTACT',
                'suspensionTime': '2019-12-07T08:21:14'
            }
        }, {
            'fullUrl':
                'https//foo_project.appspot.com/rdr/v1/Participant/P3/Summary',
            'resource': {
                'participantId': 'P3',
                'suspensionStatus': 'NO_CONTACT',
                'suspensionTime': '2017-12-07T08:21:14'
            }
        }]
    }

    self.client = bq.get_client(self.project_id)
def main(project_id, dataset_id, bucket_name, hpo_id, folder_name):
    """
    Main function to load submission into dataset

    :param project_id: Identifies the project
    :param dataset_id: Identifies the destination dataset
    :param bucket_name: the bucket in GCS containing the archive files
    :param hpo_id: Identifies the HPO site
    :param folder_name: Name of the submission folder to load
    :return:
    """
    bq_client = get_client(project_id)
    gcs_client = GCSClient(project_id)
    site_bucket = get_bucket(bq_client, hpo_id)
    prefix = f'{hpo_id}/{site_bucket}/{folder_name}'
    LOGGER.info(
        f'Starting jobs for loading {bucket_name}/{prefix} into {dataset_id}')
    _ = load_folder(dataset_id, bq_client, bucket_name, prefix, gcs_client,
                    hpo_id)
    LOGGER.info(f'Successfully loaded {bucket_name}/{prefix} into {dataset_id}')
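# Hedged usage sketch (placeholder names): loading a single site submission folder
# with the main function above; the project, dataset, bucket, site, and folder names
# are illustrative only and assume configured GCP credentials.
main('my-test-project', 'my_ehr_dataset', 'drc-archive-bucket', 'fake_hpo',
     '2021-01-01-v1/')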
def main(project_id: str, bucket_name: str, vocab_folder_path: str,
         dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: Identifies the BQ project
    :param bucket_name: refers to the bucket containing vocabulary files
    :param vocab_folder_path: points to the directory containing files downloaded
        from athena with CPT4 applied
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    vocab_folder_path = Path(vocab_folder_path)
    update_aou_vocabs(vocab_folder_path)
    upload_stage(bucket_name, vocab_folder_path, gcs_client)
    staging_dataset = check_and_create_staging_dataset(dst_dataset_id,
                                                       bucket_name, bq_client)
    load_stage(staging_dataset, bq_client, bucket_name, gcs_client)
    load(project_id, bq_client, staging_dataset.dataset_id, dst_dataset_id)
    return
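# Hedged usage sketch (placeholder names): loading a downloaded vocabulary release
# with the main function above; the local folder, bucket, and dataset names are
# illustrative only.
main('my-test-project', 'my-vocabulary-bucket', '/tmp/vocabulary_20230101',
     'vocabulary_20230101')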
def main(first_n):
    pipeline_logging.configure(logging.INFO, add_console_handler=True)

    bq_client = bq.get_client(os.environ.get('GOOGLE_CLOUD_PROJECT'))
    _check_project(bq_client)

    datasets_to_delete = _filter_stale_datasets(bq_client, first_n)

    for stale_dataset in datasets_to_delete:
        LOGGER.info(f"Running - bq_client.delete_dataset({stale_dataset})")
        try:
            bq_client.delete_dataset(stale_dataset)
        except exceptions.BadRequest as e:
            LOGGER.warning(
                f"Failed to delete {stale_dataset}. Message: {e.message}")

    return datasets_to_delete
def copy_tables_to_new_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    Lists the tables in the dataset and copies each table to a new dataset.

    :param dataset_id: identifies the dataset containing the tables to copy
    :param project_id: identifies the project containing both datasets
    :param snapshot_dataset_id: identifies the dataset to copy the tables into
    :return:
    """
    copy_table_job_ids = []
    client = bq.get_client(project_id)
    for table_id in list_all_table_ids(dataset_id):
        q = get_copy_table_query(project_id, dataset_id, table_id, client)
        results = query(q,
                        use_legacy_sql=False,
                        destination_table_id=table_id,
                        destination_dataset_id=snapshot_dataset_id,
                        batch=True)
        copy_table_job_ids.append(results['jobReference']['jobId'])

    incomplete_jobs = wait_on_jobs(copy_table_job_ids)
    if len(incomplete_jobs) > 0:
        raise BigQueryJobWaitError(incomplete_jobs)
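# Hedged usage sketch (placeholder names): snapshotting every table in a dataset into
# a new dataset; assumes the destination dataset already exists and the helper
# functions used above are in scope.
copy_tables_to_new_dataset('my-test-project', 'combined_dataset',
                           'combined_dataset_snapshot')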
def get_domain_mapping_queries(project_id, dataset_id):
    """
    This function generates a list of query dicts for creating id mappings
    in _logging_domain_alignment. The list will get consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in _logging_domain_alignment
    """
    # Create _logging_domain_alignment
    client = bq.get_client(project_id)
    table_id = f'{project_id}.{dataset_id}.{DOMAIN_ALIGNMENT_TABLE_NAME}'
    client.delete_table(table_id, not_found_ok=True)
    bq.create_tables(client, project_id, [table_id], exists_ok=False)

    domain_mapping_queries = []

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        domain_mapping_queries.append(query)

    # Create the query for creating field_mappings for the records moving between the same domain
    query = parse_domain_mapping_query_for_same_domains(project_id, dataset_id)
    domain_mapping_queries.append(query)

    # Create the query for the records that are in the wrong domain but will not be moved
    query = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    domain_mapping_queries.append(query)

    unioned_query = {
        cdr_consts.QUERY: UNION_ALL.join(domain_mapping_queries),
        cdr_consts.DESTINATION_TABLE: DOMAIN_ALIGNMENT_TABLE_NAME,
        cdr_consts.DISPOSITION: bq_consts.WRITE_EMPTY,
        cdr_consts.DESTINATION_DATASET: dataset_id
    }

    return [unioned_query]
def clean_dataset(project_id,
                  dataset_id,
                  sandbox_dataset_id,
                  rules,
                  table_namer='',
                  **kwargs):
    """
    Run the assigned cleaning rules and return list of BQ job objects

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset to clean
    :param sandbox_dataset_id: identifies the sandbox dataset to store backup rows
    :param rules: a list of cleaning rule objects/functions as tuples
    :param table_namer: source differentiator value expected to be the same for all
        rules run on the same dataset
    :param kwargs: keyword arguments a cleaning rule may require
    :return all_jobs: List of BigQuery job objects
    """
    # Set up client
    client = bq.get_client(project_id=project_id)

    all_jobs = []

    for rule_index, rule in enumerate(rules):
        clazz = rule[0]
        query_function, setup_function, rule_info = infer_rule(
            clazz, project_id, dataset_id, sandbox_dataset_id, table_namer,
            **kwargs)

        LOGGER.info(
            f"Applying cleaning rule {rule_info[cdr_consts.MODULE_NAME]} "
            f"{rule_index+1}/{len(rules)}")

        setup_function(client)
        query_list = query_function()
        jobs = run_queries(client, query_list, rule_info)
        LOGGER.info(
            f"For clean rule {rule_info[cdr_consts.MODULE_NAME]}, {len(jobs)} jobs "
            f"were run successfully for {len(query_list)} queries")
        all_jobs.extend(jobs)

    return all_jobs
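# Hedged usage sketch (placeholder names): running a single cleaning rule class
# against a dataset. SomeCleaningRule stands in for any rule class accepted by
# clean_dataset, and the project/dataset names are illustrative only.
jobs = clean_dataset('my-test-project',
                     'rdr_dataset',
                     'rdr_dataset_sandbox', [(SomeCleaningRule,)],
                     table_namer='rdr')
print(f'{len(jobs)} cleaning jobs completed')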