コード例 #1
0
 def setUp(self):
     """Record the project/dataset identifiers used by the tests.

     Stores the application id, the RDR dataset id and the sandbox dataset
     id derived from it on the instance, then hands the project id and the
     RDR dataset id to ``sandbox.check_and_create_sandbox_dataset``.
     """
     self.project_id = app_identity.get_application_id()
     rdr_dataset_id = bq_utils.get_rdr_dataset_id()
     self.dataset_id = rdr_dataset_id
     self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(rdr_dataset_id)
     # The helper is given the source dataset id, not the sandbox id.
     sandbox.check_and_create_sandbox_dataset(self.project_id, rdr_dataset_id)
コード例 #2
0
def get_remove_operational_pii_fields_query(project_id, dataset_id,
                                            sandbox_dataset_id):
    """
    Build the queries that back up and then delete operational pii records.

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Name of the sandbox dataset; when None it is
        derived from dataset_id
    :return: list of two query dicts keyed by cdr_consts.QUERY: the
        intermediary table query first, the delete query second
    """
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    # The lookup table is loaded before any query text is generated.
    load_operational_pii_fields_lookup_table(
        project_id=project_id, sandbox_dataset_id=sandbox_dataset_id)

    # Both query builders take the same positional arguments.
    builder_args = (dataset_id, project_id, sandbox_dataset_id)

    # Save operational pii records being deleted in the sandbox dataset.
    backup_query = {
        cdr_consts.QUERY: _get_intermediary_table_query(*builder_args)
    }

    # Delete operational pii records from observation table
    delete_query = {cdr_consts.QUERY: _get_delete_query(*builder_args)}

    return [backup_query, delete_query]
コード例 #3
0
def get_sandbox_queries(project_id, dataset_id, pids, ticket_number):
    """
    Returns a list of queries of all tables to be added to the datasets sandbox. These tables include all rows from all
    affected tables that include PIDs that will be removed by a specific cleaning rule.

    One query dict is produced per table returned by get_tables_with_person_id.

    :param project_id: bq project_id
    :param dataset_id: bq dataset_id
    :param pids: iterable of person_ids from cleaning rule that need to be sandboxed and removed
    :param ticket_number: ticket number from jira that will be appended to the end of the sandbox table names
    :return: list of dicts, each holding one CREATE OR REPLACE query (under cdr_consts.QUERY) to create a
        table in the sandbox
    """
    person_tables_list = get_tables_with_person_id(project_id, dataset_id)

    # Neither value depends on the table, so compute them once up front.
    # Joining the pids a single time also keeps a one-shot iterable (e.g. a
    # generator) from being exhausted after the first table.
    sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)
    # need to convert list of pids to string of pids
    pids_string = ','.join(str(pid) for pid in pids)

    return [{
        cdr_consts.QUERY:
            SANDBOX_QUERY.format(dataset=dataset_id,
                                 project=project_id,
                                 table=table,
                                 sandbox_dataset=sandbox_dataset_id,
                                 intermediary_table=table + '_' + ticket_number,
                                 pids=pids_string)
    } for table in person_tables_list]
コード例 #4
0
 def setUp(self):
     """Resolve the test identifiers and drop any existing sandbox dataset.

     The dataset id is read from the ``UNIONED_DATASET_ID`` environment
     variable and the sandbox id is derived from it.
     """
     self.project_id = app_identity.get_application_id()
     self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
     self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
     self.client = get_client(self.project_id)

     # Removing any existing datasets that might interfere with the test
     fq_sandbox_id = f'{self.project_id}.{self.sandbox_id}'
     removal_options = {'delete_contents': True, 'not_found_ok': True}
     self.client.delete_dataset(fq_sandbox_id, **removal_options)
コード例 #5
0
def get_remove_invalid_procedure_source_queries(project_id,
                                                dataset_id,
                                                sandbox_dataset_id=None):
    """
    Builds the queries which remove records that contain incorrect values in the procedure_source_concept_id field.

    Invalid procedure_source_concept_ids are where it is not in the procedure domain and
    procedure_concept_id is not standard in the procedure domain.

    :param project_id: Name of the project
    :param dataset_id: Name of the dataset where the queries should be run
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows.
        When None, the sandbox dataset id is derived from dataset_id.
    :return: list of two query dicts: the query that sandboxes the invalid
        records, then the query that rewrites TABLE (WRITE_TRUNCATE
        disposition) with the valid records
    """
    # Honor the caller-supplied sandbox; only fall back to the derived id
    # when none is given.
    if sandbox_dataset_id is None:
        sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)

    # Both query templates are formatted with the same arguments.
    query_params = dict(project=project_id,
                        dataset=dataset_id,
                        table=TABLE,
                        sandbox_dataset=sandbox_dataset_id,
                        intermediary_table=INTERMEDIARY_TABLE_NAME)

    queries_list = []

    # queries to sandbox
    queries_list.append({
        cdr_consts.QUERY:
            INVALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(**query_params)
    })

    # queries to delete invalid procedure source records
    queries_list.append({
        clean_consts.QUERY:
            VALID_PROCEDURE_SOURCE_CONCEPT_IDS_QUERY.format(**query_params),
        clean_consts.DESTINATION_TABLE: TABLE,
        clean_consts.DESTINATION_DATASET: dataset_id,
        clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE
    })

    return queries_list
コード例 #6
0
 def setUp(self):
     """Prepare identifiers, fixture data and a client for the tests.

     :raises RuntimeError: when the application id does not contain 'test'
     """
     self.hpo_id = 'fake'
     project_id = app_identity.get_application_id()
     self.project_id = project_id
     # Refuse to continue unless the project id contains 'test'.
     if 'test' not in project_id:
         raise RuntimeError(
             f"Make sure the project_id is set to test.  project_id is {project_id}"
         )

     dataset_id = bq_utils.get_dataset_id()
     self.bq_dataset_id = dataset_id
     self.bq_sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)
     self.ticket_number = 'DCXXX'
     self.pid_table_id = 'pid_table'
     # Fully qualified (project.dataset.table) id of the pid table.
     fq_pid_table = project_id + '.' + dataset_id + '.' + 'pid_table'
     self.pid_table_id_list = [fq_pid_table]
     # Every tuple shares the same second element (a date string).
     self.deactivated_ehr_participants = [
         (pid, '2010-01-01') for pid in (1, 2, 5)
     ]
     self.client = bq.get_client(project_id)
コード例 #7
0
def get_queries_clean_smoking(project_id, dataset_id, sandbox_dataset_id):
    """
    Build the queries that delete incorrect smoking rows and insert corrected rows

    The smoking lookup table is loaded before the query text is generated.

    :param project_id: project id associated with the dataset to run the queries on
    :param dataset_id: dataset id to run the queries on
    :param sandbox_dataset_id: dataset id of the sandbox; derived from dataset_id when None
    :return: list of three query dicts in order: sandbox create, delete, insert
    """
    # fetch sandbox_dataset_id
    if sandbox_dataset_id is None:
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)

    load_smoking_lookup_table(project_id, sandbox_dataset_id)

    # Arguments shared by all three query templates.
    shared_args = dict(project_id=project_id,
                       combined_dataset_id=dataset_id,
                       sandbox_dataset_id=sandbox_dataset_id)

    sandbox_sql = SANDBOX_CREATE_QUERY.format(
        new_smoking_rows=NEW_SMOKING_ROWS,
        smoking_lookup_table=SMOKING_LOOKUP_TABLE,
        **shared_args)
    delete_sql = DELETE_INCORRECT_RECORDS.format(
        smoking_lookup_table=SMOKING_LOOKUP_TABLE, **shared_args)
    insert_sql = INSERT_CORRECTED_RECORDS.format(
        new_smoking_rows=NEW_SMOKING_ROWS, **shared_args)

    return [{
        cdr_consts.QUERY: sql
    } for sql in (sandbox_sql, delete_sql, insert_sql)]
コード例 #8
0
    def setUpClass(cls):
        """Announce the test class and prepare the class-level fixtures.

        Builds a PpiBranching rule for the RDR dataset and its derived
        sandbox, records the identifiers and fully qualified table names on
        the class, then delegates to the parent ``setUpClass``.
        """
        banner = '**************************************************************'
        print(banner)
        print(cls.__name__)
        print(banner)

        super().initialize_class_vars()

        project_id = app_identity.get_application_id()
        dataset_id = bq_utils.get_rdr_dataset_id()
        sandbox_dataset_id = sandbox.get_sandbox_dataset_id(dataset_id)
        rule = PpiBranching(project_id, dataset_id, sandbox_dataset_id)

        cls.project_id = project_id
        cls.dataset_id = dataset_id
        cls.sandbox_dataset_id = sandbox_dataset_id
        cls.rule_instance = rule

        # The rule's lookup and backup tables go in the sandbox list; its
        # observation table goes in the regular list.
        sandbox_tables = (rule.lookup_table, rule.backup_table)
        cls.fq_sandbox_table_names = list(map(_fq_table_name, sandbox_tables))
        cls.fq_table_names = [_fq_table_name(rule.observation_table)]

        super().setUpClass()
コード例 #9
0
 def delete_sandbox(self):
     """Ask the client to delete the sandbox dataset derived from self.dataset_id.

     The call passes ``delete_contents=True`` and ``not_found_ok=True``.
     """
     target_dataset = sandbox.get_sandbox_dataset_id(self.dataset_id)
     removal_options = {'delete_contents': True, 'not_found_ok': True}
     self.client.delete_dataset(target_dataset, **removal_options)