def clean_combined_de_identified_clean_dataset(project_id=None, dataset_id=None):
    """
    Apply the cleaning rules for the combined de-identified clean dataset.

    Resolves defaults for any missing identifier, creates a sandbox dataset
    for the target dataset, gathers the cleaning queries and hands them to the
    clean engine under the ``stage.DEID_CLEAN`` stage.

    :param project_id: Name of the BigQuery project. When None, the value
        returned by ``app_identity.get_application_id()`` is used.
    :param dataset_id: Name of the dataset to clean. When None, the value
        returned by ``bq_utils.get_combined_deid_clean_dataset_id()`` is used.
    """
    # Resolve the project first; the default is logged so runs are traceable.
    if project_id is None:
        default_project = app_identity.get_application_id()
        LOGGER.info('Project is unspecified. Using default value of:\t%s',
                    default_project)
        project_id = default_project

    # Then resolve the dataset in the same way.
    if dataset_id is None:
        default_dataset = bq_utils.get_combined_deid_clean_dataset_id()
        LOGGER.info('Dataset is unspecified. Using default value of:\t%s',
                    default_dataset)
        dataset_id = default_dataset

    # The sandbox must exist before the queries are built, because its id is
    # one of the inputs to the query gathering step.
    sandbox_id = sandbox.create_sandbox_dataset(
        project_id=project_id,
        dataset_id=dataset_id,
    )
    cleaning_queries = _gather_combined_de_identified_clean_queries(
        project_id,
        dataset_id,
        sandbox_id,
    )

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, cleaning_queries, stage.DEID_CLEAN)
def test_create_sandbox_dataset(self):
    """
    Verify sandbox creation and the duplicate-creation failure.

    Creates a sandbox dataset for ``self.dataset_id``, checks that it shows up
    in the project's dataset listing, and checks that a second creation
    attempt for the same dataset raises ``RuntimeError``.
    """
    # Create sandbox dataset
    sandbox_dataset = sandbox.create_sandbox_dataset(self.project_id,
                                                     self.dataset_id)
    try:
        listed_ids = [d.dataset_id for d in list_datasets(self.project_id)]
        self.assertIn(sandbox_dataset, listed_ids)

        # Try to create same sandbox, which now already exists
        self.assertRaises(RuntimeError, sandbox.create_sandbox_dataset,
                          self.project_id, self.dataset_id)
    finally:
        # Remove the dataset created in the project even when an assertion
        # above fails; a leftover sandbox would make the next run fail at
        # creation time with the very RuntimeError this test checks for.
        delete_dataset(self.project_id, sandbox_dataset)
sandbox_dataset=sandbox_dataset, intermediary_table=INTERMEDIARY_TABLE) queries_list.append(invalid_measurements_query) valid_measurements_query = dict() valid_measurements_query[cdr_consts.QUERY] = VALID_MEASUREMENTS.format( dataset=dataset_id, project=project_id, sandbox_dataset=sandbox_dataset, intermediary_table=INTERMEDIARY_TABLE) queries_list.append(valid_measurements_query) return queries_list if __name__ == '__main__': import cdr_cleaner.args_parser as parser import cdr_cleaner.clean_cdr_engine as clean_engine import sandbox ARGS = parser.parse_args() # Uncomment these lines if running locally sandbox_dataset_id = sandbox.create_sandbox_dataset( project_id=ARGS.project_id, dataset_id=ARGS.dataset_id) clean_engine.add_console_logging(ARGS.console_log) query_list = get_drop_multiple_measurement_queries(ARGS.project_id, ARGS.dataset_id, sandbox_dataset_id) clean_engine.clean_dataset(ARGS.project_id, query_list)
def setUp(self):
    """
    Prepare the fixture: resolve the project and RDR dataset ids, record the
    sandbox dataset id derived from the dataset, and create that sandbox.
    """
    project = app_identity.get_application_id()
    self.project_id = project

    rdr_dataset = bq_utils.get_rdr_dataset_id()
    self.dataset_id = rdr_dataset

    # The sandbox id is computed from the dataset id alone; the value returned
    # by create_sandbox_dataset below is discarded.
    # NOTE(review): presumably both refer to the same dataset -- confirm.
    self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(rdr_dataset)

    # NOTE(review): this runs before every test; presumably a tearDown outside
    # this view removes the sandbox again -- confirm.
    sandbox.create_sandbox_dataset(project, rdr_dataset)