def test_create_sandbox_dataset(self):
    # Create sandbox dataset
    sandbox_dataset = sandbox.create_sandbox_dataset(self.project_id,
                                                     self.dataset_id)
    all_datasets_obj = list_datasets(self.project_id)
    all_datasets = [d.dataset_id for d in all_datasets_obj]

    self.assertTrue(sandbox_dataset in all_datasets)

    # Try to create same sandbox, which now already exists
    self.assertRaises(RuntimeError, sandbox.create_sandbox_dataset,
                      self.project_id, self.dataset_id)

    # Remove fake dataset created in project
    delete_dataset(self.project_id, sandbox_dataset)
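The test above relies on a fixture supplying self.project_id and self.dataset_id. A minimal sketch of such a setUp, assuming configuration comes from an environment variable; the class name, the GOOGLE_CLOUD_PROJECT variable, and the scratch dataset id are illustrative assumptions, not taken from the source.

import os
import unittest


class SandboxTest(unittest.TestCase):

    def setUp(self):
        # Hypothetical configuration: point at a real project and a scratch dataset.
        self.project_id = os.environ.get('GOOGLE_CLOUD_PROJECT', 'my-test-project')
        self.dataset_id = 'fake_dataset'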
def check_and_create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to check if the sandbox dataset exists. If it does not, it will be created.

    :param project_id: the project_id that the dataset is in
    :param dataset_id: the dataset_id to verify
    :return: the sandbox dataset_name that either exists or was created
    """
    sandbox_dataset = get_sandbox_dataset_id(dataset_id)
    dataset_objs = list_datasets(project_id)
    datasets = [d.dataset_id for d in dataset_objs]

    if sandbox_dataset not in datasets:
        create_sandbox_dataset(project_id, dataset_id)
    return sandbox_dataset
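A short usage sketch for the helper above; the project and dataset identifiers are placeholders, not values from the source.

# Hypothetical usage: ensure the sandbox dataset exists before writing sandboxed rows.
# 'my-project' and 'combined_dataset' are placeholder identifiers.
sandbox_dataset_id = check_and_create_sandbox_dataset('my-project', 'combined_dataset')
print(f'Sandboxed rows will be written to {sandbox_dataset_id}')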
def get_datasets_list(project_id, dataset_ids_str):
    """
    Returns a list of dataset_ids on which to perform retraction

    Returns a list of rdr, ehr, unioned, combined and deid dataset_ids and excludes sandbox and staging datasets

    :param project_id: identifies the project containing datasets to retract from
    :param dataset_ids_str: string of datasets to retract from, separated by spaces.
        If set to 'all_datasets', retracts from all datasets.
        If set to 'none', skips retraction from BigQuery datasets
    :return: list of dataset_ids
    :raises: AttributeError if dataset_ids_str does not allow .split()
    """
    all_dataset_ids = [
        dataset.dataset_id for dataset in bq.list_datasets(project_id)
    ]

    if not dataset_ids_str or dataset_ids_str == consts.NONE:
        dataset_ids = []
        LOGGER.info(
            "No datasets specified. Defaulting to empty list. Expect bucket-only retraction."
        )
    elif dataset_ids_str == consts.ALL_DATASETS:
        dataset_ids = all_dataset_ids
        LOGGER.info(
            f"All datasets are specified. Setting dataset_ids to all datasets in project: {project_id}"
        )
    else:
        dataset_ids = dataset_ids_str.split()
        # only consider datasets that exist in the project
        dataset_ids = [
            dataset_id for dataset_id in dataset_ids
            if dataset_id in all_dataset_ids
        ]
        LOGGER.info(
            f"Datasets specified and existing in project {project_id}: {dataset_ids}"
        )

    # consider datasets containing PPI/EHR data, excluding sandbox/staging datasets
    dataset_ids = [
        dataset_id for dataset_id in dataset_ids
        if get_dataset_type(dataset_id) != common.OTHER and
        not is_sandbox_dataset(dataset_id) and not is_staging_dataset(dataset_id)
    ]

    LOGGER.info(f"Found datasets to retract from: {', '.join(dataset_ids)}")
    return dataset_ids
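A sketch of the three input modes this function accepts, assuming consts.NONE and consts.ALL_DATASETS are the string literals 'none' and 'all_datasets' as the docstring implies; the project and dataset names are placeholders.

# Hypothetical calls illustrating the accepted forms of dataset_ids_str.
get_datasets_list('my-project', 'none')           # [] -> bucket-only retraction
get_datasets_list('my-project', 'all_datasets')   # every eligible dataset in the project
get_datasets_list('my-project', 'rdr2023 unioned_ehr2023')  # space-separated subset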
def get_dataset_ids_to_target(project_id, dataset_ids=None):
    """
    Return dataset_ids that are found in the project based on BQ metadata

    :param project_id: Identifies the project to target
    :param dataset_ids: list identifying datasets, or None for all datasets
    :return: List of dataset_ids in the project to target
    """
    all_datasets = bq.list_datasets(project_id)
    all_dataset_ids = [dataset.dataset_id for dataset in all_datasets]
    result_dataset_ids = []
    if dataset_ids is None:
        result_dataset_ids = all_dataset_ids
    else:
        for dataset_id in dataset_ids:
            if dataset_id not in all_dataset_ids:
                logging.info(
                    f"Dataset {dataset_id} not found in project {project_id}, skipping"
                )
            else:
                result_dataset_ids.append(dataset_id)
    return result_dataset_ids
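A usage sketch for the function above; the project and dataset identifiers are placeholders, not values from the source.

# Hypothetical usage: None targets every dataset in the project, while an explicit
# list is filtered down to the datasets that actually exist there.
all_targets = get_dataset_ids_to_target('my-project')
some_targets = get_dataset_ids_to_target('my-project', ['rdr2023', 'does_not_exist'])
# 'does_not_exist' is logged and skipped; only existing dataset_ids are returned.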