Example #1
def store_participant_data(df, project_id, destination_table, schema=None):
    """
    Stores the fetched participant data in a BigQuery dataset. If the
    table doesn't exist, it will create that table. If the table does
    exist, it will append the data onto that designated table.

    :param df: pandas dataframe created to hold participant data fetched from ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of dataset.tablename
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of participant data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the tables')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(destination_table.split('.')[-1])

    # Dataframe date fields must be of type datetime
    df = set_dataframe_date_fields(df, schema)

    load_job_config = LoadJobConfig(schema=schema)
    job = client.load_table_from_dataframe(df,
                                           destination_table,
                                           job_config=load_job_config)
    job.result()

    return job.job_id
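A minimal usage sketch for the loader above, assuming store_participant_data and its helpers are importable from the project's utility modules; the project name, table name, and DataFrame columns below are placeholders:

import pandas as pd

# Placeholder data; real columns must match the destination table's schema.
participants = pd.DataFrame({
    'person_id': [1, 2],
    'sign_up_time': ['2021-01-01T00:00:00Z', '2021-01-02T00:00:00Z'],
})

# Appends to fake_dataset.ps_api_values_fake_hpo, creating the table if needed,
# and returns the job_id of the BigQuery load job.
job_id = store_participant_data(participants, 'fake-project',
                                'fake_dataset.ps_api_values_fake_hpo')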
Example #2
    def setUp(self) -> None:
        self.project_id = 'fake_project'
        self.dataset_id = 'fake_dataset'
        self.sandbox_dataset_id = 'fake_sandbox'
        self.observation_schema = get_table_schema('observation')
        self.cleaning_rule = PpiBranching(self.project_id, self.dataset_id,
                                          self.sandbox_dataset_id)
Example #3
def populate_validation_table(client,
                              project_id,
                              table_id,
                              hpo_id,
                              drc_dataset_id=DRC_OPS):
    """
    Populates validation table with 'missing_rdr' or 'missing_ehr' data. Populated with 'missing_rdr' if data IS NOT
        found in the ps_values table. Populated with 'missing_ehr' as default.

    :param client: bq client
    :param project_id: the project containing the dataset
    :param table_id: ID for the table
    :param hpo_id: ID for the HPO site
    """

    schema_list = bq.get_table_schema(IDENTITY_MATCH_TABLE)
    id_match_table_id = table_id
    ps_values_table_id = f'{PS_API_VALUES}_{hpo_id}'

    fields_name_str = ', '.join([item.name for item in schema_list])

    populate_query = POPULATE_VALIDATION_TABLE.render(
        project_id=project_id,
        drc_dataset_id=drc_dataset_id,
        id_match_table_id=id_match_table_id,
        fields=fields_name_str,
        case_statements=get_case_statements(),
        ps_values_table_id=ps_values_table_id)

    job = client.query(populate_query)
    job.result()

    LOGGER.info(f'Populated values in `{id_match_table_id}`')
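A hedged call sketch; the client helper, project, table, and site identifiers below are placeholders:

client = bq.get_client('fake-project')

# Fills the identity match table for the fake site, using the default
# DRC_OPS dataset for drc_dataset_id.
populate_validation_table(client,
                          'fake-project',
                          table_id='identity_match_fake_hpo',
                          hpo_id='fake_hpo')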
Example #4
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :return: list of completed query jobs
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
Example #5
def copy_fitbit_tables_from_views(client, from_dataset, to_dataset,
                                  table_prefix):
    """
    Copies tables from views with prefix

    :param client: bq client
    :param from_dataset: dataset containing views
    :param to_dataset: dataset to create tables
    :param table_prefix: prefix added to table_ids
    :return:
    """
    for table in FITBIT_TABLES:
        schema_list = bq.get_table_schema(table)
        fq_dest_table = f'{client.project}.{to_dataset}.{table}'
        dest_table = Table(fq_dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)
        LOGGER.info(f'Created empty table {fq_dest_table}')

        fields_name_str = ',\n'.join([item.name for item in schema_list])
        fields_casted_str = ',\n'.join([
            cast_to_schema_type(item.name, item.field_type)
            for item in schema_list
        ])
        content_query = INSERT_QUERY.render(fq_dest_table=fq_dest_table,
                                            fields=fields_name_str,
                                            fields_casted=fields_casted_str,
                                            client=client,
                                            from_dataset=from_dataset,
                                            table_prefix=table_prefix,
                                            table=table)
        job = client.query(content_query)
        job.result()

    LOGGER.info(f'Copied fitbit tables from `{from_dataset}` to `{to_dataset}`')
Example #6
def safe_schema_for(table: str) -> List[SchemaField]:
    """
    Get schema fields whose date[time] fields are converted to strings so load will work

    :param table: name of the table
    :return: a list of SchemaField objects
    """
    return [
        SchemaField(
            f.name, 'string' if f.field_type.lower() in DATE_TIME_TYPES else
            f.field_type, f.mode, f.description)
        for f in bq.get_table_schema(table)
    ]
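A self-contained sketch of the same conversion on a hand-built schema, assuming DATE_TIME_TYPES is a set of lowercase type names such as {'date', 'datetime', 'timestamp'} (the real constant may differ):

from google.cloud.bigquery import SchemaField

DATE_TIME_TYPES = {'date', 'datetime', 'timestamp'}  # assumed definition

fields = [
    SchemaField('person_id', 'INTEGER', 'REQUIRED', 'Participant identifier'),
    SchemaField('observation_date', 'DATE', 'NULLABLE', 'Date of observation'),
]

safe = [
    SchemaField(f.name,
                'string' if f.field_type.lower() in DATE_TIME_TYPES else f.field_type,
                f.mode, f.description) for f in fields
]
# safe[1].field_type is now 'string', so date values can be loaded as text first.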
Example #7
def main(project_id,
         rdr_project_id,
         org_id=None,
         hpo_id=None,
         dataset_id=DRC_OPS):
    """
    Fetch participant summary data per HPO site and store it in BigQuery.

    :param project_id: identifies the project in which the data is stored
    :param rdr_project_id: identifies the RDR project the data is fetched from
    :param org_id: optional org ID; if provided, only that site (with hpo_id) is processed
    :param hpo_id: optional HPO ID matching org_id
    :param dataset_id: dataset in which to store the data, defaults to DRC_OPS
    """
    # Get list of HPOs
    LOGGER.info('Getting hpo list...')
    if org_id:
        hpo_list = [{"hpo_id": hpo_id, "org_id": org_id}]
    else:
        hpo_list = get_hpo_info(project_id)

    LOGGER.info(hpo_list)

    for hpo in hpo_list:
        org_id = hpo['org_id']
        hpo_id = hpo['hpo_id']
        # Get participant summary data
        LOGGER.info(f'Getting participant summary data for {org_id}...')
        participant_info = get_org_participant_information(
            rdr_project_id, org_id)

        # Load schema and create ingestion time-partitioned table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = f'{PS_API_VALUES}_{hpo_id}'

        client = bq.get_client(project_id)
        try:
            table = client.get_table(f'{project_id}.{dataset_id}.{tablename}')
        except NotFound:
            LOGGER.info(
                f'Creating table {project_id}.{dataset_id}.{tablename}...')

            table = bigquery.Table(f'{project_id}.{dataset_id}.{tablename}',
                                   schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.HOUR)
            table = client.create_table(table)

        # Insert summary data into table
        LOGGER.info(
            f'Storing participant data for {org_id} in table {project_id}.{dataset_id}.{tablename}...'
        )
        store_participant_data(participant_info,
                               project_id,
                               f'{dataset_id}.{tablename}',
                               schema=schema)

    LOGGER.info('Done.')
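A hedged invocation sketch; project and site identifiers are placeholders. When org_id is passed, only that single site is processed; otherwise the list returned by get_hpo_info drives the loop:

# Fetch and store participant summary data for a single fake site.
main('fake-project',
     'fake-rdr-project',
     org_id='FAKE_ORG',
     hpo_id='fake_hpo')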
Example #8
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not
    conform to a dataframe. The data is stored via a JSON file object since that is
    one of the formats BigQuery accepts.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
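A usage sketch, assuming json_data has already been shaped by process_digital_health_data_to_json; record contents, project, and table names are placeholders:

json_data = [
    {'person_id': 1, 'wearable': 'fitbit', 'status': 'YES'},
    {'person_id': 2, 'wearable': 'fitbit', 'status': 'NO'},
]

job_id = store_digital_health_status_data(
    'fake-project', json_data,
    'fake-project.fake_dataset.digital_health_sharing_status')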
Example #9
    def copy_vocab_tables(cls, vocabulary_id):
        """
        Copy the vocabulary tables into the test dataset.

        :param vocabulary_id: identifies the dataset containing the vocabulary tables
        """
        # Copy vocab tables over to the test dataset
        vocabulary_dataset = cls.client.get_dataset(vocabulary_id)
        for src_table in bq.list_tables(cls.client, vocabulary_dataset):
            schema = bq.get_table_schema(src_table.table_id)
            destination = f'{cls.project_id}.{cls.dataset_id}.{src_table.table_id}'
            dst_table = cls.client.create_table(Table(destination,
                                                      schema=schema),
                                                exists_ok=True)
            cls.client.copy_table(src_table, dst_table)
Example #10
    def backup_rows_to_drop_ddl(self) -> str:
        """
        Get a DDL statement which loads a backup table with rows to be dropped

        :return: the DDL statement
        """
        observation_schema = bq.get_table_schema(OBSERVATION)
        query = BACKUP_ROWS_QUERY.render(lookup_table=self.lookup_table,
                                         src_table=self.observation_table)
        return bq.get_create_or_replace_table_ddl(
            project_id=self.backup_table.project,
            dataset_id=self.backup_table.dataset_id,
            table_id=self.backup_table.table_id,
            schema=observation_schema,
            as_query=query)
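The returned DDL is just a string; a minimal execution sketch, where cleaning_rule and client stand in for the cleaning rule instance that defines this method and its BigQuery client:

ddl = cleaning_rule.backup_rows_to_drop_ddl()

# Runs the CREATE OR REPLACE TABLE ... AS SELECT statement to materialize
# the backup table of rows that will be dropped.
job = client.query(ddl)
job.result()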
Example #11
    def stage_to_target_ddl(self) -> str:
        """
        Get a DDL statement which drops and creates the observation
        table with rows from stage

        :return: the DDL statement
        """
        observation_schema = bq.get_table_schema(OBSERVATION)
        stage = self.stage_table
        query = f'''SELECT * FROM `{stage.project}.{stage.dataset_id}.{stage.table_id}`'''
        return bq.get_create_or_replace_table_ddl(
            project_id=self.observation_table.project,
            dataset_id=self.observation_table.dataset_id,
            schema=observation_schema,
            table_id=self.observation_table.table_id,
            as_query=query)
Example #12
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #13
    def load_test_data(self, df, project_id, dataset_id, table):
        """
        Add data to the tables for the rule to run on.

        :param df: a dataframe containing data to insert
        :param project_id: identifies the project
        :param dataset_id: identifies the dataset containing the table
        :param table: name of the table to load data into
        """
        client = get_client(project_id)
        schema = get_table_schema(table)
        schema = [field for field in schema if field.name in list(df.columns)]
        load_job_config = LoadJobConfig(schema=schema)
        load_job = client.load_table_from_dataframe(df,
                                                    f'{dataset_id}.{table}',
                                                    job_config=load_job_config)
        load_job.result()
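A sketch of how a test might call this helper; the dataframe columns are placeholders and must be a subset of the table's schema:

import pandas as pd

# Called from within the test case that defines load_test_data.
df = pd.DataFrame({'observation_id': [100], 'person_id': [1]})
self.load_test_data(df, 'fake-project', 'fake_dataset', 'observation')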
Example #14
    def stage_cleaned_table_ddl(self) -> str:
        """
        Get a DDL statement which stages cleaned table

        Note: This avoids potential partitioning mismatch error
              when directly overwriting observation table

        :return: the DDL statement
        """
        observation_schema = bq.get_table_schema(OBSERVATION)
        query = CLEANED_ROWS_QUERY.render(src=self.observation_table,
                                          backup=self.backup_table)
        return bq.get_create_or_replace_table_ddl(
            project_id=self.stage_table.project,
            dataset_id=self.stage_table.dataset_id,
            table_id=self.stage_table.table_id,
            schema=observation_schema,
            as_query=query)
Example #15
    def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
        """
        Return a list of dictionary query specifications.

        :return:  A list of dictionaries. Each dictionary contains a single query
            and a specification for how to execute that query. The specifications
            are optional but the query is required.
        """
        queries = []
        for table in self.affected_tables:
            schema = bq.get_table_schema(table)
            statements = []
            for item in schema:
                if item.name in fields:
                    if item.mode.lower() == 'nullable':
                        value = 'NULL'
                    elif item.field_type.lower() == 'integer':
                        value = 0
                    elif item.field_type.lower() == 'string':
                        value = ''
                    else:
                        raise RuntimeError(
                            f"Required field {item.name} needs to be integer or string type to be replaced"
                        )
                    suppression_statement = REPLACE_STRING.render(
                        suppression_statement=value, field=item.name)
                    statements.append(suppression_statement)
            if statements:
                suppression_statement = ', '.join(statements)
                query = dict()
                query[cdr_consts.QUERY] = ID_FIELD_SUPPRESSION_QUERY.render(
                    project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table=table,
                    replace_statement=suppression_statement)
                query[cdr_consts.DESTINATION_TABLE] = table
                query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
                query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
                queries.append(query)
        return queries
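Each element of the returned list is a plain dict keyed by the cdr_consts constants used above; a hedged sketch of how a caller might inspect them, assuming rule is an instance of this cleaning rule and cdr_consts is imported as in the example:

for spec in rule.get_query_specs():
    # The query is required; the destination keys are optional hints for the runner.
    print(spec[cdr_consts.QUERY])
    print(spec.get(cdr_consts.DESTINATION_TABLE),
          spec.get(cdr_consts.DISPOSITION),
          spec.get(cdr_consts.DESTINATION_DATASET))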
Example #16
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
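A hedged usage sketch; the clients are standard Google Cloud clients, and the dataset, bucket, prefix, and HPO ID are placeholders:

from google.cloud import bigquery, storage

bq_client = bigquery.Client(project='fake-project')
gcs_client = storage.Client(project='fake-project')

jobs = load_folder('fake_dataset', bq_client, 'fake-bucket',
                   'site_uploads/2021-01-01/', gcs_client, 'fake_hpo')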
Example #17
    def test_get_table_ddl(self):
        # Schema is determined by table name
        ddl = bq.get_create_or_replace_table_ddl(self.project_id,
                                                 self.dataset_id,
                                                 'observation').strip()
        self.assertTrue(
            ddl.startswith(
                f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
            ))
        self.assertTrue(ddl.endswith(')'))

        # Explicitly provided table name and schema are rendered
        observation_schema = bq.get_table_schema('observation')
        ddl = bq.get_create_or_replace_table_ddl(
            self.project_id,
            self.dataset_id,
            table_id='custom_observation',
            schema=observation_schema).strip()
        self.assertTrue(
            ddl.startswith(
                f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.custom_observation`'
            ))
        # Sanity check that observation schema is rendered
        self.assertTrue(
            all(field.description in ddl for field in observation_schema))
        self.assertTrue(ddl.endswith(')'))

        # Parameter as_query is rendered
        fake_as_query = "SELECT 1 FROM fake"
        ddl = bq.get_create_or_replace_table_ddl(
            self.project_id,
            self.dataset_id,
            'observation',
            as_query=fake_as_query).strip()
        self.assertTrue(
            ddl.startswith(
                f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
            ))
        self.assertTrue(ddl.endswith(fake_as_query))
Example #18
def load(project_id,
         bq_client,
         src_dataset_id,
         dst_dataset_id,
         overwrite_ok=False):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :param overwrite_ok: if True and the destination dataset already exists, it is recreated
    :return: list of query jobs
    """
    if overwrite_ok:
        bq_client.delete_dataset(dst_dataset_id,
                                 delete_contents=True,
                                 not_found_ok=True)
    bq_client.create_dataset(dst_dataset_id)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
    return query_jobs
Example #19
def get_case_statements():
    """
    This method generates the CASE_STATEMENT query
    """
    case_statements = []
    field_list = []

    schema_list = bq.get_table_schema(IDENTITY_MATCH_TABLE)
    for item in schema_list:
        field_list.append(item.name)

    # this removes the person_id as it is primary key and will not be updated in case statement
    field_list.remove('person_id')
    # this removes algorithm as it is not updated in case statement
    field_list.remove('algorithm')

    for item in field_list:
        ps_api_item = IDENTITY_MATCH_PS_API_FIELD_MAP[item]
        case_statements.append(
            CASE_EXPRESSION.render(identity_match_field=item,
                                   ps_api_field=ps_api_item))

    return ', '.join(case_statements)
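CASE_EXPRESSION itself is not shown in this example; a plausible Jinja2 stand-in for the kind of expression being joined, consistent with the 'missing_rdr'/'missing_ehr' semantics described earlier (the real template may differ):

from jinja2 import Template

# Hypothetical stand-in for CASE_EXPRESSION; the real template may differ.
CASE_EXPRESSION = Template('''
    CASE
        WHEN ps.{{ps_api_field}} IS NULL THEN 'missing_rdr'
        ELSE 'missing_ehr'
    END AS {{identity_match_field}}''')

print(CASE_EXPRESSION.render(identity_match_field='first_name',
                             ps_api_field='first_name'))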
Example #20
class Observation(object):
    """
    Helper class to initialize test observation rows
    """

    SCHEMA = bq.get_table_schema('observation')
    """List of schema fields for observation table"""

    _FIELD_DEFAULTS = dict(
        (field.name, _default_value_for(field)) for field in SCHEMA)
    """Maps field names to default values"""
    def __init__(self, **kwargs):
        # only permit observation fields as args
        for prop, val in kwargs.items():
            if prop not in Observation._FIELD_DEFAULTS.keys():
                raise ValueError(
                    f'Supplied key {prop} is not a field in the observation table'
                )
            self.__setattr__(prop, val)
        # unset args are set to a (dummy) default value
        for field_name, default_val in Observation._FIELD_DEFAULTS.items():
            if field_name not in kwargs.keys():
                self.__setattr__(field_name, default_val)
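A short usage sketch for the helper class; only valid observation fields are accepted, and anything not supplied falls back to its schema-derived default:

obs = Observation(observation_id=100, person_id=1)
# All remaining observation fields are set to their (dummy) defaults.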
Example #21
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [self.bq_dataset_id]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq.create_tables(
            self.client,
            self.test_project_id, [
                f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
            ],
            exists_ok=False,
            fields=[rbq.PID_TABLE_FIELDS])
        bq_formatted_insert_values = ', '.join([
            f'({person_id}, {research_id})'
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        job = self.client.query(q)
        job.result()

        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = f'{self.hpo_id}_{cdm_table}'
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            logging.info(
                f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
            with open(cdm_file, 'rb') as f:
                job_config = bigquery.LoadJobConfig()
                job_config.source_format = bigquery.SourceFormat.CSV
                job_config.skip_leading_rows = 1
                job_config.write_disposition = 'WRITE_EMPTY'
                job_config.schema = bq.get_table_schema(cdm_table)
                load_job = self.client.load_table_from_file(
                    f,
                    f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                    job_config=job_config)
                load_job.result()
        logging.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            job = self.client.query(row_count_queries[table])
            result = job.result()
            expected_row_count[table] = result.to_dataframe()['count'].to_list(
            )[0]

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_before_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        # perform retraction
        rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                              self.test_project_id, self.pid_table_id,
                              self.hpo_id, self.dataset_ids,
                              self.retraction_type)

        # find actual deleted rows
        job = self.client.query(q)
        result = job.result().to_dataframe()
        row_counts_after_retraction = pd.Series(
            result.row_count.values, index=result.table_id).to_dict()

        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table],
                row_counts_before_retraction[table] -
                row_counts_after_retraction[table])
Example #22
    def test_get_table_schema(self):
        actual_fields = bq.get_table_schema('digital_health_sharing_status')

        for field in actual_fields:
            if field.field_type.upper() == "RECORD":
                self.assertEqual(len(field.fields), 2)
Example #23
def create_rdr_tables(client, rdr_dataset, bucket):
    """
    Create tables from the data in the RDR bucket.

    Uses the client to load data directly from the bucket into
    a table.

    :param client: a bigquery client object
    :param rdr_dataset: The existing dataset to load file data into
    :param bucket: the gcs bucket containing the file data.
    """
    schema_dict = resources.cdm_schemas()
    schema_dict.update(resources.rdr_specific_schemas())

    project = client.project

    for table, schema in schema_dict.items():
        schema_list = bq.get_table_schema(table, schema)
        table_id = f'{project}.{rdr_dataset}.{table}'
        job_config = bigquery.LoadJobConfig(
            schema=schema_list,
            skip_leading_rows=1,
            source_format=bigquery.SourceFormat.CSV,
            field_delimiter=',',
            allow_quoted_newlines=True,
            quote_character='"',
            write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
        if table == 'observation_period':
            job_config.allow_jagged_rows = True

        for schema_item in schema_list:
            if ('person_id' in schema_item.name and
                    table.lower() != 'pid_rid_mapping'):
                # clustering_fields expects a list of field names
                job_config.clustering_fields = ['person_id']
                job_config.time_partitioning = bigquery.table.TimePartitioning(
                    type_='DAY')

        # path to bucketed csv file
        uri = f'gs://{bucket}/{table}.csv'

        # job_id defined to the second precision
        job_id = f'rdr_load_{table.lower()}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

        LOGGER.info(f'Loading `{uri}` into `{table_id}`')
        try:
            load_job = client.load_table_from_uri(
                uri, table_id, job_config=job_config,
                job_id=job_id)  # Make an API request.

            load_job.result()  # Waits for the job to complete.
        except NotFound:
            LOGGER.info(
                f'{table} not provided by RDR team.  Creating empty table '
                f'in dataset: `{rdr_dataset}`')

            LOGGER.info(f'Creating empty CDM table, `{table}`')
            destination_table = bigquery.Table(table_id, schema=schema_list)
            destination_table = client.create_table(destination_table)
            LOGGER.info(f'Created empty table `{destination_table.table_id}`')
        else:
            destination_table = client.get_table(
                table_id)  # Make an API request.
        LOGGER.info(f'Loaded {destination_table.num_rows} rows into '
                    f'`{destination_table.table_id}`.')

    LOGGER.info(f"Finished RDR table LOAD from bucket gs://{bucket}")