Example #1
 def setUp(self):
     self.project_id = app_identity.get_application_id()
     self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
     self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
     # Removing any existing datasets that might interfere with the test
     self.client = get_client(self.project_id)
     self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                                delete_contents=True,
                                not_found_ok=True)
Example #2
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    args = parse_args(args)
    query_list = create_queries(args.project_id, args.ticket_number,
                                args.pids_project_id, args.pids_dataset_id,
                                args.pids_table, args.dataset_list)
    client = bq.get_client(args.project_id)
    run_queries(query_list, client)
    LOGGER.info("Retraction complete")
Example #3
 def setUp(self):
     self.project_id = app_identity.get_application_id()
     # this ensures the dataset is scoped appropriately in test and also
     # can be dropped in teardown (tests should not delete env resources)
     unioned_dataset_id = os.environ.get('UNIONED_DATASET_ID')
     self.dataset_id = f'{unioned_dataset_id}_bq_test'
     self.description = f'Dataset for {__name__} integration tests'
     self.label_or_tag = {'test': 'bq'}
     self.client = bq.get_client(self.project_id)
     self.dataset_ref = bigquery.dataset.DatasetReference(
         self.project_id, self.dataset_id)
Example #4
 def setUp(self):
     self.hpo_id = 'fake'
     self.project_id = 'fake-project-id'
     self.test_project_id = app_identity.get_application_id()
     self.pid_table_id = 'pid_table'
     self.bq_dataset_id = os.environ.get('UNIONED_DATASET_ID')
     self.client = bq.get_client(self.test_project_id)
     self.dataset_ids = 'all_datasets'
     self.retraction_type = 'only_ehr'
     self.person_research_ids = [(1, 6890173), (2, 858761),
                                 (1234567, 4589763)]
Example #5
def run_deletion(project_id, name_substrings):
    """
    Deletes datasets in the project whose names contain any of the name_substrings

    :param project_id: identifies the project
    :param name_substrings: substrings used to identify datasets to delete
    :return:
    """
    # make the developer running this script approve the environment.
    msg = (f'This will remove datasets from the `{project_id}` '
           f'environment.\nAre you sure you want to proceed?  '
           f'[Y/y/N/n]:  ')

    LOGGER.info(msg)
    proceed = input(msg)

    LOGGER.info(f'User entered: "{proceed}"')

    if proceed.lower() != 'y':
        LOGGER.info(f'User requested to exit the deletion script.\n'
                    f'Exiting clean_project_datasets script now.')
        return

    LOGGER.info('Continuing with dataset deletions...')

    client = bq.get_client(project_id)

    all_datasets = [
        dataset.dataset_id for dataset in list(client.list_datasets())
    ]

    datasets_with_substrings = [
        dataset for dataset in all_datasets for substring in name_substrings
        if substring in dataset
    ]

    LOGGER.info(f'{len(datasets_with_substrings)} Datasets marked for '
                f'deletion in project `{project_id}`: ')
    for dataset in datasets_with_substrings:
        LOGGER.info(f'\t{dataset}')

    msg = (f'After reviewing datasets, proceed?\nYou will need to review '
           f'the log file if you are not printing to the console.\nThis action '
           f'cannot be reversed.\n'
           f'[Y/y/N/n]:  ')
    LOGGER.info(msg)
    response = input(msg)

    if response.lower() == 'y':
        _delete_datasets(client, datasets_with_substrings)
    else:
        LOGGER.info("Proper consent was not given.  Aborting deletion.")

    LOGGER.info("Dataset deletion completed.")
Example #6
def main(args=None):
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parser = get_parser()
    args = parser.parse_args(args)
    client = bq.get_client(args.project_id)
    dataset_ids = ru.get_datasets_list(args.project_id, args.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(client, args.project_id, dataset_ids, args.fq_deact_table,
                     args.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
Example #7
 def setUp(self):
     self.project_id = app_identity.get_application_id()
     if 'test' not in self.project_id:
         raise RuntimeError(
             f"Make sure the project_id is set to test. Project_id is {self.project_id}"
         )
     self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
     self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
     self.client = bq.get_client(self.project_id)
     self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
     self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
     self.setup_data()
Example #8
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids_ex=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids_ex: (optional) list that identifies HPOs not to process, by default process all
    :returns: list of tables generated successfully
    """
    client = get_client(project_id)

    logging.info('EHR union started')
    # Get all hpo_ids.
    hpo_ids = [item['hpo_id'] for item in bq_utils.get_hpo_info()]
    if hpo_ids_ex:
        hpo_ids = [hpo_id for hpo_id in hpo_ids if hpo_id not in hpo_ids_ex]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info(f'Creating {output_dataset_id}.{result_table}...')
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(f'Mapping {domain_table}...')
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id, client)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(f'Creating union of table {table_name}...')
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = common.PERSON
    logging.info(f'Mapping {domain_table}...')
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id, client)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation, one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)

    logging.info('Completed Person to Observation')
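A minimal usage sketch for the union pipeline above, assuming it is called from the same module; the dataset ids and excluded HPO id are placeholders.

# Hypothetical invocation; dataset ids and the excluded HPO id are placeholders.
main(input_dataset_id='ehr_submissions',
     output_dataset_id='unioned_ehr',
     project_id='my-test-project',
     hpo_ids_ex=['fake_hpo'])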
Example #9
def main(project_id,
         rdr_project_id,
         org_id=None,
         hpo_id=None,
         dataset_id=DRC_OPS):

    #Get list of hpos
    LOGGER.info('Getting hpo list...')
    if org_id:
        hpo_list = [{"hpo_id": hpo_id, "org_id": org_id}]
    else:
        hpo_list = get_hpo_info(project_id)

    LOGGER.info(hpo_list)

    for hpo in hpo_list:
        org_id = hpo['org_id']
        hpo_id = hpo['hpo_id']
        # Get participant summary data
        LOGGER.info(f'Getting participant summary data for {org_id}...')
        participant_info = get_org_participant_information(
            rdr_project_id, org_id)

        # Load schema and create ingestion time-partitioned table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = f'{PS_API_VALUES}_{hpo_id}'

        client = bq.get_client(project_id)
        try:
            table = client.get_table(f'{project_id}.{dataset_id}.{tablename}')
        except NotFound:
            LOGGER.info(
                f'Creating table {project_id}.{dataset_id}.{tablename}...')

            table = bigquery.Table(f'{project_id}.{dataset_id}.{tablename}',
                                   schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.HOUR)
            table = client.create_table(table)

        # Insert summary data into table
        LOGGER.info(
            f'Storing participant data for {org_id} in table {project_id}.{dataset_id}.{tablename}...'
        )
        store_participant_data(participant_info,
                               project_id,
                               f'{dataset_id}.{tablename}',
                               schema=schema)

    LOGGER.info(f'Done.')
Example #10
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not conform to a dataframe.
    The data is stored using a newline-delimited JSON file object since that is one of the formats BigQuery accepts.
    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
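A sketch of calling the loader above with hand-built records; the record fields and destination table path are assumptions, since real payloads come from process_digital_health_data_to_json.

# Hypothetical records and table path; real data comes from
# process_digital_health_data_to_json.
json_data = [
    {'person_id': 1, 'wearable': 'fitbit', 'status': 'YES'},
    {'person_id': 2, 'wearable': 'appleHealthKit', 'status': 'NO'},
]
job_id = store_digital_health_status_data(
    'my-test-project', json_data,
    'my-test-project.drc_ops.digital_health_sharing_status')
print(f'Load job: {job_id}')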
Example #11
    def setUpClass(cls):
        print('**************************************************************')
        print(cls.__name__)
        print('**************************************************************')

        cls.project_id = os.environ.get(PROJECT_ID)
        cls.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        cls.client = get_client(cls.project_id)

        cls.hpo_id = 'fake_hpo'
        cls.org_id = 'fake_org'
        cls.ps_api_table = f'{PS_API_VALUES}_{cls.hpo_id}'

        cls.fq_table_names = [
            f'{cls.project_id}.{cls.dataset_id}.{cls.ps_api_table}'
        ]
Example #12
def check_and_create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to check if the sandbox dataset exists. If it does not, it will create it.

    :param project_id: the project_id that the dataset is in
    :param dataset_id: the dataset_id to verify
    :return: the sandbox dataset_name that either exists or was created
    """
    client = get_client(project_id)
    sandbox_dataset = get_sandbox_dataset_id(dataset_id)
    dataset_objs = list(client.list_datasets(project_id))
    datasets = [d.dataset_id for d in dataset_objs]

    if sandbox_dataset not in datasets:
        create_sandbox_dataset(project_id, dataset_id)
    return sandbox_dataset
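A one-line usage sketch; the project and dataset ids are placeholders.

# Hypothetical usage: make sure a sandbox dataset exists before sandboxing rows.
sandbox_dataset = check_and_create_sandbox_dataset('my-test-project', 'unioned_ehr')
print(f'Sandboxing into {sandbox_dataset}')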
Example #13
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = os.environ.get('UNIONED_DATASET_ID')

        self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
        self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
        self.table_ids = [self.fq_person, self.fq_person_ext]

        self.person = Table.from_string(self.fq_person)
        self.person_ext = Table.from_string(self.fq_person_ext)

        self.client = bq.get_client(self.project_id)
        for table in self.table_ids:
            self.client.delete_table(table, not_found_ok=True)
        bq.create_tables(self.client, self.project_id, self.table_ids)
        self.populate_tables()
Example #14
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #15
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    #Set up pipeline logging
    pipeline_logging.configure(level=logging.DEBUG, add_console_handler=True)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # Populates the validation table for the site
    identify_rdr_ehr_match(client, args.project_id, args.hpo_id, EHR_OPS)

    LOGGER.info('Done.')
Example #16
def get_datasets_list(project_id, dataset_ids_list):
    """
    Returns list of dataset_ids on which to perform retraction

    Returns list of rdr, ehr, unioned, combined and deid dataset_ids and excludes sandbox and staging datasets
    :param project_id: identifies the project containing datasets to retract from
    :param dataset_ids_list: list of dataset_ids to retract from. If set to ['all_datasets'],
        retracts from all datasets. If set to ['none'], skips retraction from BigQuery datasets
    :return: List of dataset_ids
    """
    client = bq.get_client(project_id)
    all_dataset_ids = [
        dataset.dataset_id
        for dataset in list(client.list_datasets(project_id))
    ]

    if not dataset_ids_list or dataset_ids_list == [consts.NONE]:
        dataset_ids = []
        LOGGER.info(
            "No datasets specified. Defaulting to empty list. Expect bucket only retraction."
        )
    elif dataset_ids_list == [consts.ALL_DATASETS]:
        dataset_ids = all_dataset_ids
        LOGGER.info(
            f"All datasets are specified. Setting dataset_ids to all datasets in project: {project_id}"
        )
    else:
        # only consider datasets that exist in the project
        dataset_ids = [
            dataset_id for dataset_id in dataset_ids_list
            if dataset_id in all_dataset_ids
        ]
        LOGGER.info(
            f"Datasets specified and existing in project {project_id}: {dataset_ids}"
        )

    # consider datasets containing PPI/EHR data, excluding sandbox/staging datasets
    dataset_ids = [
        dataset_id for dataset_id in dataset_ids
        if get_dataset_type(dataset_id) != common.OTHER
        and not is_sandbox_dataset(dataset_id)
    ]

    LOGGER.info(f"Found datasets to retract from: {', '.join(dataset_ids)}")
    return dataset_ids
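A usage sketch, assuming consts.ALL_DATASETS is the string 'all_datasets' as suggested by the test fixtures above; the project id is a placeholder.

# Hypothetical usage; assumes consts.ALL_DATASETS == 'all_datasets'.
dataset_ids = get_datasets_list('my-test-project', ['all_datasets'])
for dataset_id in dataset_ids:
    print(dataset_id)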
Example #17
def main(raw_args=None):
    """
    Truncate and store fitbit data.

    Assumes you are passing arguments either via command line or a
    list.
    """
    parser = get_fitbit_parser()
    args, kwargs = clean_cdr.fetch_args_kwargs(parser, raw_args)

    pipeline_logging.configure(level=logging.INFO,
                               add_console_handler=args.console_log)

    # Identify the cleaning classes being run for specified data_stage
    # and validate if all the required arguments are supplied
    cleaning_classes = clean_cdr.DATA_STAGE_RULES_MAPPING[consts.FITBIT]
    clean_cdr.validate_custom_params(cleaning_classes, **kwargs)

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    # create staging, sandbox, backup and clean datasets with descriptions and labels
    fitbit_datasets = create_fitbit_datasets(client, args.release_tag)

    copy_fitbit_tables_from_views(client,
                                  args.fitbit_dataset,
                                  fitbit_datasets[consts.BACKUP],
                                  table_prefix='v_')
    bq.copy_datasets(client, fitbit_datasets[consts.BACKUP],
                     fitbit_datasets[consts.STAGING])

    common_cleaning_args = [
        '-p', args.project_id, '-d', fitbit_datasets[consts.STAGING], '-b',
        fitbit_datasets[consts.SANDBOX], '-s', '-a', consts.FITBIT
    ]
    fitbit_cleaning_args = args_parser.add_kwargs_to_args(
        common_cleaning_args, kwargs)

    clean_cdr.main(args=fitbit_cleaning_args)

    # Snapshot the staging dataset to final dataset
    bq.build_and_copy_contents(client, fitbit_datasets[consts.STAGING],
                               fitbit_datasets[consts.CLEAN])
Example #18
    def load_test_data(self, df, project_id, dataset_id, table):
        """
        Add data to the tables for the rule to run on.

        :param df: a dataframe containing data to insert
        :param project_id: identifies the project containing the dataset
        :param dataset_id: identifies the dataset containing the table
        :param table: name of the table to load the dataframe into
        """
        client = get_client(project_id)
        schema = get_table_schema(table)
        schema = [field for field in schema if field.name in list(df.columns)]
        load_job_config = LoadJobConfig(schema=schema)
        load_job = client.load_table_from_dataframe(df,
                                                    f'{dataset_id}.{table}',
                                                    job_config=load_job_config)
        load_job.result()
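A sketch of how a test might call the helper above from inside the same test class; the column names are assumptions, chosen as a subset of the OMOP person table schema.

import pandas as pd

# Hypothetical call from inside a test method of the same class;
# the columns are a subset of the OMOP person table schema.
df = pd.DataFrame({'person_id': [1, 2], 'gender_concept_id': [8507, 8532]})
self.load_test_data(df, self.project_id, self.dataset_id, 'person')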
Example #19
def count_pid_rows_in_dataset(project_id, dataset_id, hpo_id, pid_source):
    """
    Returns a df containing tables and counts of participant rows for pids in pid_source

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :param hpo_id: Identifies the hpo site that submitted the pids
    :param pid_source: string containing query or list containing pids
    :return: df with headers table_id, all_counts, all_ehr_counts, and map_ehr_counts
    """
    dataset_type = ru.get_dataset_type(dataset_id)
    counts_df = pd.DataFrame(columns=[
        ru_consts.TABLE_ID, consts.ALL_COUNT, consts.ALL_EHR_COUNT,
        consts.MAP_EHR_COUNT
    ])
    bq_client = bq.get_client(project_id)
    cols_query = bq.dataset_columns_query(project_id, dataset_id)
    table_df = bq_client.query(cols_query).to_dataframe()

    if dataset_type == common.COMBINED:
        query = get_combined_deid_query(project_id, dataset_id, pid_source,
                                        table_df)
    elif dataset_type == common.DEID or dataset_type == common.RELEASE:
        query = get_combined_deid_query(project_id,
                                        dataset_id,
                                        pid_source,
                                        table_df,
                                        for_deid=True)
    elif dataset_type == common.EHR:
        query = get_ehr_query(project_id, dataset_id, pid_source, hpo_id,
                              table_df)
    elif dataset_type == common.RDR:
        query = get_dataset_query(project_id,
                                  dataset_id,
                                  pid_source,
                                  table_df,
                                  for_rdr=True)
    else:
        query = get_dataset_query(project_id, dataset_id, pid_source, table_df)

    if query:
        counts_df = bq.query(query, project_id)
        # sort by count desc
        counts_df = counts_df.sort_values(by=consts.ALL_COUNT, ascending=False)
    return counts_df
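A usage sketch for the counting helper above; the ids are placeholders and pid_source is shown as a literal list of person_ids.

# Hypothetical usage; pid_source may also be a query string per the docstring.
counts_df = count_pid_rows_in_dataset('my-test-project', 'combined_dataset',
                                      hpo_id='fake_hpo', pid_source=[1, 2, 3])
print(counts_df.head())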
Example #20
 def setUp(self):
     self.hpo_id = 'fake'
     self.project_id = app_identity.get_application_id()
     if 'test' not in self.project_id:
         raise RuntimeError(
             f"Make sure the project_id is set to test.  project_id is {self.project_id}"
         )
     self.bq_dataset_id = bq_utils.get_dataset_id()
     self.bq_sandbox_dataset_id = get_sandbox_dataset_id(self.bq_dataset_id)
     self.ticket_number = 'DCXXX'
     self.pid_table_id = 'pid_table'
     self.pid_table_id_list = [
         self.project_id + '.' + self.bq_dataset_id + '.' + 'pid_table'
     ]
     self.deactivated_ehr_participants = [(1, '2010-01-01'),
                                          (2, '2010-01-01'),
                                          (5, '2010-01-01')]
     self.client = bq.get_client(self.project_id)
Example #21
    def setUp(self):
        self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        self.folder_prefix = '2019-01-01/'
        test_util.delete_all_tables(self.dataset_id)
        test_util.empty_bucket(self.hpo_bucket)

        self.client = bq.get_client(self.project_id)

        mock_get_hpo_name = mock.patch('validation.main.get_hpo_name')

        self.mock_get_hpo_name = mock_get_hpo_name.start()
        self.mock_get_hpo_name.return_value = 'Fake HPO'
        self.addCleanup(mock_get_hpo_name.stop)

        self._load_data()
Example #22
def main():
    parser = get_arg_parser()
    args = parser.parse_args()

    # get credentials and create client
    impersonation_creds = auth.get_impersonation_credentials(
        args.run_as_email, SCOPES)

    client = bq.get_client(args.project_id, credentials=impersonation_creds)

    table_id = f'{IDENTITY_MATCH_TABLE}_{args.hpo_id}'

    # Creates hpo_site identity match table if it does not exist
    if not table_exists(table_id, DRC_OPS):
        create_drc_validation_table(client, args.project_id, table_id)

    # Populates the validation table for the site
    populate_validation_table(client, args.project_id, table_id, args.hpo_id)
Example #23
def load_deid_map_table(deid_map_dataset_name, age_limit):

    # Create _deid_map table in input dataset
    project_id = app_identity.get_application_id()
    client = bq.get_client(project_id)
    deid_map_table = f'{project_id}.{deid_map_dataset_name}.{DEID_MAP_TABLE}'
    # Copy master _deid_map table records to _deid_map table
    if bq_utils.table_exists(DEID_MAP_TABLE,
                             dataset_id=PIPELINE_TABLES_DATASET):
        copy_deid_map_table(deid_map_table, project_id,
                            PIPELINE_TABLES_DATASET, deid_map_dataset_name,
                            age_limit, client)
        logging.info(
            f"copied participants younger than {age_limit} to the table {deid_map_table}"
        )
    else:
        raise RuntimeError(
            f'{DEID_MAP_TABLE} is not available in {project_id}.{PIPELINE_TABLES_DATASET}'
        )
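A usage sketch; the dataset name is a placeholder and the age limit follows the log message in the function above.

# Hypothetical usage: copy participants younger than the age limit into the
# dataset-local _deid_map table.
load_deid_map_table('combined_release_deid', age_limit=18)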
Example #24
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.sandbox_id = check_and_create_sandbox_dataset(
            self.project_id, self.dataset_id)
        self.tablename = '_deactivated_participants'
        self.ticket_number = 'DC12345'

        self.columns = ['participantId', 'suspensionStatus', 'suspensionTime']

        self.deactivated_participants = [(1, 'NO_CONTACT', '2018-12-07'),
                                         (2, 'NO_CONTACT', '2019-12-07'),
                                         (3, 'NO_CONTACT', '2017-12-07')]

        self.json_response_entry = {
            'entry': [{
                'fullUrl':
                'https://foo_project.appspot.com/rdr/v1/Participant/P1/Summary',
                'resource': {
                    'participantId': 'P1',
                    'suspensionStatus': 'NO_CONTACT',
                    'suspensionTime': '2018-12-07T08:21:14'
                }
            }, {
                'fullUrl':
                'https://foo_project.appspot.com/rdr/v1/Participant/P2/Summary',
                'resource': {
                    'participantId': 'P2',
                    'suspensionStatus': 'NO_CONTACT',
                    'suspensionTime': '2019-12-07T08:21:14'
                }
            }, {
                'fullUrl':
                'https://foo_project.appspot.com/rdr/v1/Participant/P3/Summary',
                'resource': {
                    'participantId': 'P3',
                    'suspensionStatus': 'NO_CONTACT',
                    'suspensionTime': '2017-12-07T08:21:14'
                }
            }]
        }

        self.client = bq.get_client(self.project_id)
Example #25
def main(project_id, dataset_id, bucket_name, hpo_id, folder_name):
    """
    Main function to load submission into dataset

    :param project_id: Identifies the project
    :param dataset_id: Identifies the destination dataset
    :param bucket_name: the bucket in GCS containing the archive files
    :param hpo_id: Identifies the HPO site
    :param folder_name: Name of the submission folder to load
    :return:
    """
    bq_client = get_client(project_id)
    gcs_client = GCSClient(project_id)
    site_bucket = get_bucket(bq_client, hpo_id)
    prefix = f'{hpo_id}/{site_bucket}/{folder_name}'
    LOGGER.info(
        f'Starting jobs for loading {bucket_name}/{prefix} into {dataset_id}')
    _ = load_folder(dataset_id, bq_client, bucket_name, prefix, gcs_client,
                    hpo_id)
    LOGGER.info(f'Successfully loaded {bucket_name}/{prefix} into {dataset_id}')
Example #26
def main(project_id: str, bucket_name: str, vocab_folder_path: str,
         dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: Identifies the BQ project
    :param bucket_name: refers to the bucket containing vocabulary files
    :param vocab_folder_path: points to the directory containing files downloaded from athena with CPT4 applied
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    vocab_folder_path = Path(vocab_folder_path)
    update_aou_vocabs(vocab_folder_path)
    upload_stage(bucket_name, vocab_folder_path, gcs_client)
    staging_dataset = check_and_create_staging_dataset(dst_dataset_id,
                                                       bucket_name, bq_client)
    load_stage(staging_dataset, bq_client, bucket_name, gcs_client)
    load(project_id, bq_client, staging_dataset.dataset_id, dst_dataset_id)
    return
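A usage sketch for the vocabulary load above; the bucket, folder path, and dataset id are placeholders.

# Hypothetical invocation; all identifiers are placeholders.
main(project_id='my-test-project',
     bucket_name='vocab_bucket',
     vocab_folder_path='/tmp/vocabulary_20230101',
     dst_dataset_id='vocabulary_20230101')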
Example #27
def main(first_n):

    pipeline_logging.configure(logging.INFO, add_console_handler=True)

    bq_client = bq.get_client(os.environ.get('GOOGLE_CLOUD_PROJECT'))

    _check_project(bq_client)

    datasets_to_delete = _filter_stale_datasets(bq_client, first_n)

    for stale_dataset in datasets_to_delete:

        LOGGER.info(f"Running - bq_client.delete_dataset({stale_dataset})")

        try:
            bq_client.delete_dataset(stale_dataset)
        except exceptions.BadRequest as e:
            LOGGER.warning(
                f"Failed to delete {stale_dataset}. Message: {e.message}")

    return datasets_to_delete
Example #28
def copy_tables_to_new_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    Lists the tables in the dataset and copies each table to a new dataset.

    :param project_id: identifies the project containing the datasets
    :param dataset_id: identifies the dataset to copy tables from
    :param snapshot_dataset_id: identifies the dataset to copy the tables into
    :return:
    """
    copy_table_job_ids = []
    client = bq.get_client(project_id)
    for table_id in list_all_table_ids(dataset_id):
        q = get_copy_table_query(project_id, dataset_id, table_id, client)
        results = query(q,
                        use_legacy_sql=False,
                        destination_table_id=table_id,
                        destination_dataset_id=snapshot_dataset_id,
                        batch=True)
        copy_table_job_ids.append(results['jobReference']['jobId'])
    incomplete_jobs = wait_on_jobs(copy_table_job_ids)
    if len(incomplete_jobs) > 0:
        raise BigQueryJobWaitError(incomplete_jobs)
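A usage sketch; the dataset ids are placeholders.

# Hypothetical usage: snapshot every table in unioned_ehr into a new dataset.
copy_tables_to_new_dataset('my-test-project', 'unioned_ehr',
                           'unioned_ehr_snapshot')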
Example #29
def get_domain_mapping_queries(project_id, dataset_id):
    """
    This function generates a list of query dicts for creating id mappings in _logging_domain_alignment.
    The list will get consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in _logging_domain_alignment
    """
    # Create _logging_domain_alignment
    client = bq.get_client(project_id)
    table_id = f'{project_id}.{dataset_id}.{DOMAIN_ALIGNMENT_TABLE_NAME}'
    client.delete_table(table_id, not_found_ok=True)
    bq.create_tables(client, project_id, [table_id], exists_ok=False)

    domain_mapping_queries = []

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        domain_mapping_queries.append(query)

    # Create the query for creating field_mappings for the records moving between the same domain
    query = parse_domain_mapping_query_for_same_domains(project_id, dataset_id)
    domain_mapping_queries.append(query)

    # Create the query for the records that are in the wrong domain but will not be moved
    query = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    domain_mapping_queries.append(query)

    unioned_query = {
        cdr_consts.QUERY: UNION_ALL.join(domain_mapping_queries),
        cdr_consts.DESTINATION_TABLE: DOMAIN_ALIGNMENT_TABLE_NAME,
        cdr_consts.DISPOSITION: bq_consts.WRITE_EMPTY,
        cdr_consts.DESTINATION_DATASET: dataset_id
    }

    return [unioned_query]
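A sketch of inspecting the returned query dicts; the cdr_consts keys are the ones referenced in the function above, and the ids are placeholders.

# Hypothetical usage; the dicts are normally handed to the cleaning engine.
query_dicts = get_domain_mapping_queries('my-test-project', 'combined_dataset')
for query_dict in query_dicts:
    print(query_dict[cdr_consts.DESTINATION_TABLE],
          query_dict[cdr_consts.DESTINATION_DATASET])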
Example #30
def clean_dataset(project_id,
                  dataset_id,
                  sandbox_dataset_id,
                  rules,
                  table_namer='',
                  **kwargs):
    """
    Run the assigned cleaning rules and return list of BQ job objects

    :param project_id: identifies the project
    :param dataset_id: identifies the dataset to clean
    :param sandbox_dataset_id: identifies the sandbox dataset to store backup rows
    :param rules: a list of cleaning rule objects/functions as tuples
    :param table_namer: source differentiator value expected to be the same for all rules run on the same dataset
    :param kwargs: keyword arguments a cleaning rule may require
    :return all_jobs: List of BigQuery job objects
    """
    # Set up client
    client = bq.get_client(project_id=project_id)

    all_jobs = []
    for rule_index, rule in enumerate(rules):
        clazz = rule[0]
        query_function, setup_function, rule_info = infer_rule(
            clazz, project_id, dataset_id, sandbox_dataset_id, table_namer,
            **kwargs)

        LOGGER.info(
            f"Applying cleaning rule {rule_info[cdr_consts.MODULE_NAME]} "
            f"{rule_index+1}/{len(rules)}")
        setup_function(client)
        query_list = query_function()
        jobs = run_queries(client, query_list, rule_info)
        LOGGER.info(
            f"For clean rule {rule_info[cdr_consts.MODULE_NAME]}, {len(jobs)} jobs "
            f"were run successfully for {len(query_list)} queries")
        all_jobs.extend(jobs)
    return all_jobs
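A usage sketch for the rule runner above; SomeCleaningRule stands in for a real cleaning rule class and the ids are placeholders.

# Hypothetical usage; SomeCleaningRule is a stand-in for a real cleaning rule class,
# and rules are (class, ...) tuples as expected by infer_rule above.
jobs = clean_dataset(project_id='my-test-project',
                     dataset_id='combined_dataset',
                     sandbox_dataset_id='combined_dataset_sandbox',
                     rules=[(SomeCleaningRule,)],
                     table_namer='combined')
print(f'{len(jobs)} BigQuery jobs completed')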