Ejemplo n.º 1
0
def clean_combined_de_identified_clean_dataset(project_id=None,
                                               dataset_id=None):
    """
    Apply the clean rules defined for the de-identified EHR and RDR clean dataset.

    Any argument left as None is replaced with a default and the substitution
    is logged.  A sandbox dataset is then created for the target dataset, the
    cleaning queries are gathered, and the clean engine is invoked with them.

    :param project_id:  Name of the BigQuery project.  When None, the value of
        ``app_identity.get_application_id()`` is used.
    :param dataset_id:  Name of the dataset to clean.  When None, the value of
        ``bq_utils.get_combined_deid_clean_dataset_id()`` is used.
    """
    # Fall back to the default project when the caller did not name one.
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info(
            'Project is unspecified.  Using default value of:\t%s',
            project_id,
        )

    # Same fallback for the dataset to be cleaned.
    if dataset_id is None:
        dataset_id = bq_utils.get_combined_deid_clean_dataset_id()
        LOGGER.info(
            'Dataset is unspecified.  Using default value of:\t%s',
            dataset_id,
        )

    # The sandbox must exist before the queries are gathered, because its id
    # is one of the inputs used to build them.
    sandbox_id = sandbox.create_sandbox_dataset(
        project_id=project_id,
        dataset_id=dataset_id,
    )
    cleaning_queries = _gather_combined_de_identified_clean_queries(
        project_id,
        dataset_id,
        sandbox_id,
    )

    LOGGER.info("Cleaning de-identified dataset")
    clean_engine.clean_dataset(project_id, cleaning_queries, stage.DEID_CLEAN)
Ejemplo n.º 2
0
    def test_create_sandbox_dataset(self):
        """
        Verify that a sandbox dataset can be created exactly once.

        Creates the sandbox for ``self.dataset_id``, checks that the returned
        id appears in the project's dataset listing, and checks that a second
        creation attempt raises ``RuntimeError``.  The created dataset is
        deleted afterwards whether or not the assertions pass.
        """
        # Create sandbox dataset
        sandbox_dataset = sandbox.create_sandbox_dataset(
            self.project_id, self.dataset_id)

        try:
            all_datasets_obj = list_datasets(self.project_id)
            all_datasets = [d.dataset_id for d in all_datasets_obj]

            # assertIn reports both the member and the container on failure,
            # unlike assertTrue(x in y) which only reports "False is not true".
            self.assertIn(sandbox_dataset, all_datasets)

            # Try to create same sandbox, which now already exists
            self.assertRaises(RuntimeError, sandbox.create_sandbox_dataset,
                              self.project_id, self.dataset_id)
        finally:
            # Remove fake dataset created in project.  Done in `finally` so a
            # failed assertion does not leave the dataset behind.
            delete_dataset(self.project_id, sandbox_dataset)
Ejemplo n.º 3
0
            sandbox_dataset=sandbox_dataset,
            intermediary_table=INTERMEDIARY_TABLE)
    queries_list.append(invalid_measurements_query)

    valid_measurements_query = dict()
    valid_measurements_query[cdr_consts.QUERY] = VALID_MEASUREMENTS.format(
        dataset=dataset_id,
        project=project_id,
        sandbox_dataset=sandbox_dataset,
        intermediary_table=INTERMEDIARY_TABLE)
    queries_list.append(valid_measurements_query)

    return queries_list


if __name__ == '__main__':
    # Script entry point: parse the arguments, create a sandbox dataset for
    # the target dataset, gather the drop-multiple-measurement queries and
    # pass them to the clean engine.
    # Imports are local to this guard, so they only happen when the module is
    # executed directly.
    import cdr_cleaner.args_parser as parser
    import cdr_cleaner.clean_cdr_engine as clean_engine
    import sandbox

    ARGS = parser.parse_args()

    # The steps below are live code: they run every time the module is
    # executed as a script.
    sandbox_dataset_id = sandbox.create_sandbox_dataset(
        project_id=ARGS.project_id, dataset_id=ARGS.dataset_id)
    # presumably enables console log output according to the parsed flag --
    # confirm against clean_cdr_engine.add_console_logging
    clean_engine.add_console_logging(ARGS.console_log)
    query_list = get_drop_multiple_measurement_queries(ARGS.project_id,
                                                       ARGS.dataset_id,
                                                       sandbox_dataset_id)
    # Hand the gathered queries to the clean engine for the target project.
    clean_engine.clean_dataset(ARGS.project_id, query_list)
 def setUp(self):
     """
     Prepare the fixture: record ids on the instance and create the sandbox.

     Stores the application id as ``self.project_id``, the id returned by
     ``bq_utils.get_rdr_dataset_id()`` as ``self.dataset_id``, and the
     matching sandbox dataset id as ``self.sandbox_dataset_id``; then creates
     the sandbox dataset for that project/dataset pair.
     """
     # Keep local aliases so the later calls read the freshly resolved values.
     self.project_id = project = app_identity.get_application_id()
     self.dataset_id = dataset = bq_utils.get_rdr_dataset_id()

     self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset)

     sandbox.create_sandbox_dataset(project, dataset)