Example #1
def get_sandbox_queries(project_id, dataset_id, pids, ticket_number):
    """
    Returns a list of queries that create sandbox tables for the dataset. Each sandbox table stores every row from an
    affected table that contains a person_id slated for removal by a specific cleaning rule.

    :param project_id: bq project_id
    :param dataset_id: bq dataset_id
    :param pids: list of person_ids from cleaning rule that need to be sandboxed and removed
    :param ticket_number: ticket number from jira that will be appended to the end of the sandbox table names
    :return: list of CREATE OR REPLACE queries to create tables in sandbox
    """
    person_tables_list = get_tables_with_person_id(project_id, dataset_id)
    queries_list = []

    for table in person_tables_list:
        sandbox_queries = dict()
        sandbox_queries[cdr_consts.QUERY] = SANDBOX_QUERY.format(
            dataset=dataset_id,
            project=project_id,
            table=table,
            sandbox_dataset=get_sandbox_dataset_id(dataset_id),
            intermediary_table=table + '_' + ticket_number,
            # need to convert list of pids to string of pids
            pids=','.join([str(i) for i in pids]))
        queries_list.append(sandbox_queries)

    return queries_list
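
The SANDBOX_QUERY template itself is not shown above. A minimal sketch of what such a template could look like, assuming it follows the CREATE OR REPLACE pattern the docstring describes (the exact SQL is an assumption; only the placeholder names are taken from the .format() call above):

# Hypothetical stand-in for the SANDBOX_QUERY constant used above
SANDBOX_QUERY = """
CREATE OR REPLACE TABLE `{project}.{sandbox_dataset}.{intermediary_table}` AS
SELECT *
FROM `{project}.{dataset}.{table}`
WHERE person_id IN ({pids})
"""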
Example #2
def main(project_id: str, bucket_name: str, dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: identifies the project to load the vocabulary into
    :param bucket_name: refers to the bucket containing vocabulary files
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    sandbox_dataset_id = get_sandbox_dataset_id(dst_dataset_id)
    sandbox_dataset = bq.create_dataset(
        project_id,
        sandbox_dataset_id,
        f'Vocabulary loaded from gs://{bucket_name}',
        label_or_tag={'type': 'vocabulary'},
        overwrite_existing=True)
    stage_jobs = load_stage(sandbox_dataset, bq_client, bucket_name, gcs_client)
    wait_jobs(stage_jobs)
    load_jobs = load(project_id,
                     bq_client,
                     sandbox_dataset_id,
                     dst_dataset_id,
                     overwrite_ok=True)
    wait_jobs(load_jobs)
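
wait_jobs is not defined in this snippet. A plausible sketch, assuming it simply blocks until each BigQuery job completes (the helper's real behavior is not shown here):

def wait_jobs(jobs):
    """Block until every job in the list finishes, raising on the first failure."""
    for job in jobs:
        # result() waits for the job to complete and re-raises any job error
        job.result()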
Example #3
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_rdr_dataset_id()
        self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(
            self.dataset_id)
        sandbox.check_and_create_sandbox_dataset(self.project_id,
                                                 self.dataset_id)
Example #4
def get_remove_operational_pii_fields_query(project_id, dataset_id,
                                            sandbox_dataset_id):
    """

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Name of the sandbox dataset
    :return:
    """
    # fetch sandbox_dataset_id
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    load_operational_pii_fields_lookup_table(
        project_id=project_id, sandbox_dataset_id=sandbox_dataset_id)

    queries_list = []

    # Save operational PII records being deleted in the sandbox `dataset.intermediary_table`.
    query = dict()
    query[cdr_consts.QUERY] = _get_intermediary_table_query(
        dataset_id, project_id, sandbox_dataset_id)

    queries_list.append(query)

    # Delete operational pii records from observation table
    query = dict()

    query[cdr_consts.QUERY] = _get_delete_query(dataset_id, project_id,
                                                sandbox_dataset_id)
    queries_list.append(query)

    return queries_list
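
The returned dicts carry their SQL under the cdr_consts.QUERY key. A minimal sketch of a caller that executes such a list, assuming cdr_consts.QUERY is simply the string 'query' (the constant's actual value is not shown here):

from google.cloud import bigquery

def run_query_dicts(project_id, query_dicts):
    # Hypothetical runner, for illustration only
    client = bigquery.Client(project=project_id)
    for query_dict in query_dicts:
        # 'query' stands in for cdr_consts.QUERY
        client.query(query_dict['query']).result()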
Example #5
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
        self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
        # Removing any existing datasets that might interfere with the test
        self.client = get_client(self.project_id)
        self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                                   delete_contents=True,
                                   not_found_ok=True)
Example #6
def get_remove_invalid_procedure_source_queries(project_id,
                                                dataset_id,
                                                sandbox_dataset_id=None):
    """
    runs the query which removes records that contain incorrect values in the procedure_source_concept_id field
    invalid procedure_source_concept_ids are where it is not in the procedure domain and
    procedure_concept_id is not standard in the procedure domain

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows 
    #TODO use sandbox_dataset_id for CR
    :return:
    """
    queries_list = []

    # queries to sandbox
    invalid_records = dict()
    invalid_records[
        cdr_consts.QUERY] = INVALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(
            project=project_id,
            dataset=dataset_id,
            table=TABLE,
            sandbox_dataset=get_sandbox_dataset_id(dataset_id),
            intermediary_table=INTERMEDIARY_TABLE_NAME)
    queries_list.append(invalid_records)

    # queries to delete invalid procedure source records
    valid_records = VALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(
        project=project_id,
        dataset=dataset_id,
        table=TABLE,
        sandbox_dataset=get_sandbox_dataset_id(dataset_id),
        intermediary_table=INTERMEDIARY_TABLE_NAME)
    queries_list.append({
        clean_consts.QUERY: valid_records,
        clean_consts.DESTINATION_TABLE: TABLE,
        clean_consts.DESTINATION_DATASET: dataset_id,
        clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE
    })

    return queries_list
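
The second dict routes query results to a destination table with a WRITE_TRUNCATE disposition, i.e. the cleaned rows overwrite the original table. In plain BigQuery client terms that combination corresponds roughly to the following sketch (the project and table ids are placeholders, and this is the underlying API rather than the project's own runner):

from google.cloud import bigquery

client = bigquery.Client(project='my-project')
job_config = bigquery.QueryJobConfig(
    # query results replace the contents of the destination table
    destination='my-project.my_dataset.procedure_occurrence',
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)
client.query('SELECT 1 AS placeholder', job_config=job_config).result()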
Example #7
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        if 'test' not in self.project_id:
            raise RuntimeError(
                f"Make sure the project_id is set to test. Project_id is {self.project_id}"
            )
        self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
        self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.client = bq.get_client(self.project_id)
        self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
        self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
        self.setup_data()
Example #8
def get_queries_clean_smoking(project_id, dataset_id, sandbox_dataset_id):
    """
    Queries to run for deleting incorrect smoking rows and inserting corrected rows

    :param project_id: project id associated with the dataset to run the queries on
    :param dataset_id: dataset id to run the queries on
    :param sandbox_dataset_id: dataset id of the sandbox
    :return: list of query dicts
    """
    queries = []

    # fetch sandbox_dataset_id
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    load_smoking_lookup_table(project_id, sandbox_dataset_id)

    sandbox_query = dict()
    sandbox_query[cdr_consts.QUERY] = SANDBOX_CREATE_QUERY.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_smoking_rows=NEW_SMOKING_ROWS,
        smoking_lookup_table=SMOKING_LOOKUP_TABLE)
    queries.append(sandbox_query)

    delete_query = dict()
    delete_query[cdr_consts.QUERY] = DELETE_INCORRECT_RECORDS.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        smoking_lookup_table=SMOKING_LOOKUP_TABLE)
    queries.append(delete_query)

    insert_query = dict()
    insert_query[cdr_consts.QUERY] = INSERT_CORRECTED_RECORDS.format(
        project_id=project_id,
        dataset_id=dataset_id,
        sandbox_dataset_id=sandbox_dataset_id,
        new_smoking_rows=NEW_SMOKING_ROWS)
    queries.append(insert_query)

    return queries
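
The three templates implement a sandbox-then-rewrite pattern: back up the affected rows, delete them, then insert corrected replacements. As a rough illustration, the delete step could look like the sketch below, assuming the lookup table keys on observation_source_concept_id (the real DELETE_INCORRECT_RECORDS SQL is not shown above; only the placeholder names are taken from the .format() call):

# Hypothetical stand-in for the DELETE_INCORRECT_RECORDS template
DELETE_INCORRECT_RECORDS = """
DELETE FROM `{project_id}.{dataset_id}.observation`
WHERE observation_source_concept_id IN (
    SELECT observation_source_concept_id
    FROM `{project_id}.{sandbox_dataset_id}.{smoking_lookup_table}`)
"""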
Example #9
    @classmethod
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

        super().initialize_class_vars()
        project_id = app_identity.get_application_id()
        dataset_id = bq_utils.get_rdr_dataset_id()
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)
        rule = PpiBranching(project_id, dataset_id, sandbox_dataset_id)
        cls.dataset_id = dataset_id
        cls.sandbox_dataset_id = sandbox_dataset_id
        cls.project_id = project_id
        cls.rule_instance = rule
        cls.fq_sandbox_table_names = [
            _fq_table_name(table)
            for table in (rule.lookup_table, rule.backup_table)
        ]
        cls.fq_table_names = [_fq_table_name(rule.observation_table)]
        super().setUpClass()
Example #10
    def delete_sandbox(self):
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
        self.client.delete_dataset(sandbox_dataset_id,
                                   delete_contents=True,
                                   not_found_ok=True)
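
Every example above derives the sandbox dataset's name from the main dataset via get_sandbox_dataset_id. A one-line sketch consistent with that usage, assuming the naming convention is simply a '_sandbox' suffix (the real implementation is not reproduced here):

def get_sandbox_dataset_id(dataset_id):
    """Return the id of the sandbox dataset paired with the given dataset."""
    # assumed convention: sandbox dataset is the dataset id plus a suffix
    return f'{dataset_id}_sandbox'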