def store_participant_data(df, project_id, destination_table, schema=None):
    """
    Stores the fetched participant data in a BigQuery dataset. If the table
    doesn't exist, it will create that table. If the table does exist, it will
    append the data onto that designated table.

    :param df: pandas dataframe created to hold participant data fetched from ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of dataset.tablename
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of participant data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(destination_table.split('.')[-1])

    # Dataframe date fields must be of type datetime
    df = set_dataframe_date_fields(df, schema)

    load_job_config = LoadJobConfig(schema=schema)
    job = client.load_table_from_dataframe(df,
                                           destination_table,
                                           job_config=load_job_config)
    job.result()

    return job.job_id
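# Usage sketch (illustrative, not from the source): load a small dataframe
# with store_participant_data. Assumes application-default credentials and
# that the placeholder project/dataset names exist; the column names are only
# a guess at the ps_api_values schema.
def _example_store_participant_data():
    import pandas as pd

    df = pd.DataFrame([{'person_id': 1, 'first_name': 'Fake'}])
    job_id = store_participant_data(
        df,
        project_id='fake-project',
        destination_table='fake_dataset.ps_api_values_fake_hpo')
    print(f'load job: {job_id}')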
def setUp(self) -> None:
    self.project_id = 'fake_project'
    self.dataset_id = 'fake_dataset'
    self.sandbox_dataset_id = 'fake_sandbox'
    self.observation_schema = get_table_schema('observation')
    self.cleaning_rule = PpiBranching(self.project_id, self.dataset_id,
                                      self.sandbox_dataset_id)
def populate_validation_table(client, project_id, table_id, hpo_id,
                              drc_dataset_id=DRC_OPS):
    """
    Populates validation table with 'missing_rdr' or 'missing_ehr' data.

    Populated with 'missing_rdr' if data IS NOT found in the ps_values table.
    Populated with 'missing_ehr' as default.

    :param client: bq client
    :param project_id: the project containing the dataset
    :param table_id: ID for the table
    :param hpo_id: ID for the HPO site
    :param drc_dataset_id: dataset containing the validation table, defaults to DRC_OPS
    """
    schema_list = bq.get_table_schema(IDENTITY_MATCH_TABLE)
    id_match_table_id = table_id
    ps_values_table_id = f'{PS_API_VALUES}_{hpo_id}'

    fields_name_str = ', '.join([item.name for item in schema_list])

    populate_query = POPULATE_VALIDATION_TABLE.render(
        project_id=project_id,
        drc_dataset_id=drc_dataset_id,
        id_match_table_id=id_match_table_id,
        fields=fields_name_str,
        case_statements=get_case_statements(),
        ps_values_table_id=ps_values_table_id)

    job = client.query(populate_query)
    job.result()

    LOGGER.info(f'Populated values in `{id_match_table_id}`')
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: list of completed QueryJob objects
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)

    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
def copy_fitbit_tables_from_views(client, from_dataset, to_dataset,
                                  table_prefix):
    """
    Copies Fitbit data from prefixed views in one dataset into tables in another.

    :param client: bq client
    :param from_dataset: dataset containing views
    :param to_dataset: dataset to create tables in
    :param table_prefix: prefix added to table_ids
    :return: None
    """
    for table in FITBIT_TABLES:
        schema_list = bq.get_table_schema(table)
        fq_dest_table = f'{client.project}.{to_dataset}.{table}'
        dest_table = Table(fq_dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)
        LOGGER.info(f'Created empty table {fq_dest_table}')

        fields_name_str = ',\n'.join([item.name for item in schema_list])
        fields_casted_str = ',\n'.join([
            cast_to_schema_type(item.name, item.field_type)
            for item in schema_list
        ])
        content_query = INSERT_QUERY.render(fq_dest_table=fq_dest_table,
                                            fields=fields_name_str,
                                            fields_casted=fields_casted_str,
                                            client=client,
                                            from_dataset=from_dataset,
                                            table_prefix=table_prefix,
                                            table=table)
        job = client.query(content_query)
        job.result()

    LOGGER.info(f'Copied fitbit tables from `{from_dataset}` to `{to_dataset}`')
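# Hypothetical usage sketch for copy_fitbit_tables_from_views; the dataset
# names and view prefix below are placeholders, and the client comes from the
# standard BigQuery library.
def _example_copy_fitbit_tables():
    from google.cloud import bigquery

    client = bigquery.Client(project='fake-project')
    copy_fitbit_tables_from_views(client,
                                  from_dataset='fitbit_views',
                                  to_dataset='fitbit_tables',
                                  table_prefix='v_')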
def safe_schema_for(table: str) -> List[SchemaField]:
    """
    Get schema fields whose date[time] fields are converted to strings so load will work

    :param table: name of the table
    :return: a list of SchemaField objects
    """
    return [
        SchemaField(
            f.name, 'string'
            if f.field_type.lower() in DATE_TIME_TYPES else f.field_type,
            f.mode, f.description) for f in bq.get_table_schema(table)
    ]
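# Illustration of the conversion safe_schema_for performs. DATE_TIME_TYPES is
# defined in the surrounding module; the set below is an assumed stand-in
# covering BigQuery's date/time types.
def _example_safe_schema():
    from google.cloud.bigquery import SchemaField

    date_time_types = {'date', 'datetime', 'time', 'timestamp'}  # assumed
    fields = [
        SchemaField('person_id', 'INTEGER', 'REQUIRED', ''),
        SchemaField('observation_date', 'DATE', 'NULLABLE', ''),
    ]
    safe = [
        SchemaField(
            f.name, 'string'
            if f.field_type.lower() in date_time_types else f.field_type,
            f.mode, f.description) for f in fields
    ]
    # The DATE column can now be loaded from string-valued source data
    assert safe[1].field_type == 'string'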
def main(project_id,
         rdr_project_id,
         org_id=None,
         hpo_id=None,
         dataset_id=DRC_OPS):

    # Get list of HPOs
    LOGGER.info('Getting hpo list...')
    if org_id:
        hpo_list = [{"hpo_id": hpo_id, "org_id": org_id}]
    else:
        hpo_list = get_hpo_info(project_id)
    LOGGER.info(hpo_list)

    for hpo in hpo_list:
        org_id = hpo['org_id']
        hpo_id = hpo['hpo_id']

        # Get participant summary data
        LOGGER.info(f'Getting participant summary data for {org_id}...')
        participant_info = get_org_participant_information(
            rdr_project_id, org_id)

        # Load schema and create ingestion time-partitioned table
        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = f'{PS_API_VALUES}_{hpo_id}'

        client = bq.get_client(project_id)
        try:
            table = client.get_table(f'{project_id}.{dataset_id}.{tablename}')
        except NotFound:
            LOGGER.info(
                f'Creating table {project_id}.{dataset_id}.{tablename}...')
            table = bigquery.Table(f'{project_id}.{dataset_id}.{tablename}',
                                   schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.HOUR)
            table = client.create_table(table)

        # Insert summary data into table
        LOGGER.info(
            f'Storing participant data for {org_id} in table {project_id}.{dataset_id}.{tablename}...'
        )
        store_participant_data(participant_info,
                               project_id,
                               f'{dataset_id}.{tablename}',
                               schema=schema)

    LOGGER.info('Done.')
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does
    exist, it will create a partition in the designated table or append to the
    same partition. This is necessary for storing data that has "RECORD" type
    fields which do not conform to a dataframe. The data is stored using a JSON
    file object since it is one of the formats BigQuery expects.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')

    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
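# Minimal sketch of the newline-delimited JSON buffer the function builds
# before handing it to load_table_from_file (stdlib only; the field names are
# made up for illustration).
def _example_ndjson_buffer():
    import json
    from io import StringIO

    json_data = [{'person_id': 1, 'status': 'YES'},
                 {'person_id': 2, 'status': 'NO'}]
    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    # rewind=True in load_table_from_file performs the equivalent of this seek
    file_obj.seek(0)
    assert len(file_obj.read().splitlines()) == len(json_data)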
def copy_vocab_tables(cls, vocabulary_id):
    """
    A function for copying the vocab tables to the test dataset_id

    :param vocabulary_id: identifies the dataset containing the vocabulary tables
    :return: None
    """
    # Copy vocab tables over to the test dataset
    vocabulary_dataset = cls.client.get_dataset(vocabulary_id)
    for src_table in bq.list_tables(cls.client, vocabulary_dataset):
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{cls.project_id}.{cls.dataset_id}.{src_table.table_id}'
        dst_table = cls.client.create_table(Table(destination, schema=schema),
                                            exists_ok=True)
        cls.client.copy_table(src_table, dst_table)
def backup_rows_to_drop_ddl(self) -> str:
    """
    Get a DDL statement which loads a backup table with rows to be dropped

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    query = BACKUP_ROWS_QUERY.render(lookup_table=self.lookup_table,
                                     src_table=self.observation_table)
    return bq.get_create_or_replace_table_ddl(
        project_id=self.backup_table.project,
        dataset_id=self.backup_table.dataset_id,
        table_id=self.backup_table.table_id,
        schema=observation_schema,
        as_query=query)
def stage_to_target_ddl(self) -> str:
    """
    Get a DDL statement which drops and creates the observation table with rows from stage

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    stage = self.stage_table
    query = f'''SELECT * FROM `{stage.project}.{stage.dataset_id}.{stage.table_id}`'''
    return bq.get_create_or_replace_table_ddl(
        project_id=self.observation_table.project,
        dataset_id=self.observation_table.dataset_id,
        schema=observation_schema,
        table_id=self.observation_table.table_id,
        as_query=query)
def setUp(self):
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.schema = [
        SchemaField("person_id", "INT64"),
        SchemaField("first_name", "STRING"),
        SchemaField("last_name", "STRING"),
        SchemaField("algorithm", "STRING")
    ]

    self.ps_api_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable')
    ]

    self.id_match_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable'),
        dict(name='algorithm', type='string', mode='nullable')
    ]

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

    # Create and populate the ps_values site table
    schema = bq.get_table_schema(PS_API_VALUES)
    tablename = self.ps_values_table_id

    table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                  schema=schema)
    table.time_partitioning = TimePartitioning(type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()
def load_test_data(self, df, project_id, dataset_id, table):
    """
    Add data to the tables for the rule to run on.

    :param df: a dataframe containing data to insert
    :param project_id: identifies the project
    :param dataset_id: identifies the dataset containing the table
    :param table: name of the destination table
    """
    client = get_client(project_id)
    schema = get_table_schema(table)
    # Restrict the load schema to the columns present in the dataframe
    schema = [field for field in schema if field.name in list(df.columns)]
    load_job_config = LoadJobConfig(schema=schema)
    load_job = client.load_table_from_dataframe(df,
                                                f'{dataset_id}.{table}',
                                                job_config=load_job_config)
    load_job.result()
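# Hypothetical usage sketch: insert two partial observation rows for a
# cleaning-rule test. Because the load schema is restricted to the dataframe's
# columns, rows need not populate every observation field.
#
#   df = pandas.DataFrame([
#       {'observation_id': 1, 'person_id': 1},
#       {'observation_id': 2, 'person_id': 2},
#   ])
#   self.load_test_data(df, self.project_id, self.dataset_id, 'observation')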
def stage_cleaned_table_ddl(self) -> str:
    """
    Get a DDL statement which stages cleaned table

    Note: This avoids potential partitioning mismatch error when directly
    overwriting observation table

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    query = CLEANED_ROWS_QUERY.render(src=self.observation_table,
                                      backup=self.backup_table)
    return bq.get_create_or_replace_table_ddl(
        project_id=self.stage_table.project,
        dataset_id=self.stage_table.dataset_id,
        table_id=self.stage_table.table_id,
        schema=observation_schema,
        as_query=query)
def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
    """
    Return a list of dictionary query specifications.

    :return: A list of dictionaries. Each dictionary contains a single query
        and a specification for how to execute that query. The specifications
        are optional but the query is required.
    """
    queries = []
    for table in self.affected_tables:
        schema = bq.get_table_schema(table)
        statements = []
        for item in schema:
            if item.name in fields:
                if item.mode.lower() == 'nullable':
                    value = 'NULL'
                elif item.field_type.lower() == 'integer':
                    value = 0
                elif item.field_type.lower() == 'string':
                    value = ''
                else:
                    raise RuntimeError(
                        f"Required field {item.name} needs to be integer or string type to be replaced"
                    )
                suppression_statement = REPLACE_STRING.render(
                    suppression_statement=value, field=item.name)
                statements.append(suppression_statement)
        if statements:
            suppression_statement = ', '.join(statements)
            query = dict()
            query[cdr_consts.QUERY] = ID_FIELD_SUPPRESSION_QUERY.render(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                table=table,
                replace_statement=suppression_statement)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
            queries.append(query)
        else:
            continue
    return queries
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
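# Hypothetical usage sketch for load_folder; the bucket, prefix, and hpo_id
# are placeholders, and the clients come from the standard google-cloud
# libraries.
def _example_load_folder():
    from google.cloud import bigquery, storage

    bq_client = bigquery.Client(project='fake-project')
    gcs_client = storage.Client(project='fake-project')
    jobs = load_folder('fake_dataset', bq_client, 'fake-bucket',
                       '2021-01-01/fake_hpo/', gcs_client, 'fake_hpo')
    print(f'completed {len(jobs)} load jobs')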
def test_get_table_ddl(self):
    # Schema is determined by table name
    ddl = bq.get_create_or_replace_table_ddl(self.project_id, self.dataset_id,
                                             'observation').strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
        ))
    self.assertTrue(ddl.endswith(')'))

    # Explicitly provided table name and schema are rendered
    observation_schema = bq.get_table_schema('observation')
    ddl = bq.get_create_or_replace_table_ddl(
        self.project_id,
        self.dataset_id,
        table_id='custom_observation',
        schema=observation_schema).strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.custom_observation`'
        ))
    # Sanity check that observation schema is rendered
    self.assertTrue(
        all(field.description in ddl for field in observation_schema))
    self.assertTrue(ddl.endswith(')'))

    # Parameter as_query is rendered
    fake_as_query = "SELECT 1 FROM fake"
    ddl = bq.get_create_or_replace_table_ddl(
        self.project_id, self.dataset_id, 'observation',
        as_query=fake_as_query).strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
        ))
    self.assertTrue(ddl.endswith(fake_as_query))
def load(project_id,
         bq_client,
         src_dataset_id,
         dst_dataset_id,
         overwrite_ok=False):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :param overwrite_ok: if True and the dest dataset already exists, the
        dataset is recreated
    :return: list of QueryJob objects
    """
    if overwrite_ok:
        bq_client.delete_dataset(dst_dataset_id,
                                 delete_contents=True,
                                 not_found_ok=True)
    bq_client.create_dataset(dst_dataset_id)

    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
    return query_jobs
def get_case_statements():
    """
    Generates the CASE expressions used by the validation table query.
    """
    case_statements = []
    field_list = []

    schema_list = bq.get_table_schema(IDENTITY_MATCH_TABLE)
    for item in schema_list:
        field_list.append(item.name)

    # person_id is the primary key and is not updated in a case statement
    field_list.remove('person_id')
    # algorithm is not updated in a case statement
    field_list.remove('algorithm')

    for item in field_list:
        ps_api_item = IDENTITY_MATCH_PS_API_FIELD_MAP[item]
        case_statements.append(
            CASE_EXPRESSION.render(identity_match_field=item,
                                   ps_api_field=ps_api_item))

    return ', '.join(case_statements)
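# Illustration only: the real CASE_EXPRESSION template is defined elsewhere in
# the module. A stand-in Jinja2 template of the same general shape, based on
# the 'missing_rdr' behavior documented in populate_validation_table, might
# render like this.
def _example_case_expression():
    from jinja2 import Template

    case_expression = Template(
        "CASE WHEN ps.{{ps_api_field}} IS NULL THEN 'missing_rdr' "
        "ELSE {{identity_match_field}} END AS {{identity_match_field}}")
    print(case_expression.render(identity_match_field='first_name',
                                 ps_api_field='first_name'))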
class Observation(object):
    """
    Helper class to initialize test observation rows
    """

    SCHEMA = bq.get_table_schema('observation')
    """List of schema fields for observation table"""

    _FIELD_DEFAULTS = dict(
        (field.name, _default_value_for(field)) for field in SCHEMA)
    """Maps field names to default values"""

    def __init__(self, **kwargs):
        # only permit observation fields as args
        for prop, val in kwargs.items():
            if prop not in Observation._FIELD_DEFAULTS.keys():
                raise ValueError(
                    f'Supplied key {prop} is not a field in the observation table'
                )
            self.__setattr__(prop, val)
        # unset args are set to a (dummy) default value
        for field_name, default_val in Observation._FIELD_DEFAULTS.items():
            if field_name not in kwargs.keys():
                self.__setattr__(field_name, default_val)
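# Usage sketch: observation_id and person_id are OMOP observation columns, so
# these kwargs pass the field check; any field not supplied falls back to its
# dummy default, and unknown kwargs raise ValueError.
def _example_observation_row():
    obs = Observation(observation_id=100, person_id=1)
    assert obs.observation_id == 100
    try:
        Observation(not_a_field=1)
    except ValueError:
        pass  # unknown kwargs are rejected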
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset, mock_is_unioned_dataset,
        mock_is_combined_dataset, mock_is_deid_dataset):
    mock_list_datasets.return_value = [self.bq_dataset_id]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq.create_tables(
        self.client,
        self.test_project_id, [
            f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
        ],
        exists_ok=False,
        fields=[rbq.PID_TABLE_FIELDS])
    bq_formatted_insert_values = ', '.join([
        f'({person_id}, {research_id})'
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    job = self.client.query(q)
    job.result()

    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = f'{self.hpo_id}_{cdm_table}'
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        logging.info(
            f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
        with open(cdm_file, 'rb') as f:
            job_config = bigquery.LoadJobConfig()
            job_config.source_format = bigquery.SourceFormat.CSV
            job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_EMPTY'
            job_config.schema = bq.get_table_schema(cdm_table)
            load_job = self.client.load_table_from_file(
                f,
                f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                job_config=job_config)
            load_job.result()
    logging.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        job = self.client.query(row_count_queries[table])
        result = job.result()
        expected_row_count[table] = result.to_dataframe()['count'].to_list()[0]

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_before_retraction = pd.Series(
        result.row_count.values, index=result.table_id).to_dict()

    # perform retraction
    rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                          self.test_project_id, self.pid_table_id,
                          self.hpo_id, self.dataset_ids, self.retraction_type)

    # find actual deleted rows
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_after_retraction = pd.Series(
        result.row_count.values, index=result.table_id).to_dict()

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table], row_counts_before_retraction[table] -
            row_counts_after_retraction[table])
def test_get_table_schema(self):
    actual_fields = bq.get_table_schema('digital_health_sharing_status')

    for field in actual_fields:
        if field.field_type.upper() == "RECORD":
            self.assertEqual(len(field.fields), 2)
def create_rdr_tables(client, rdr_dataset, bucket):
    """
    Create tables from the data in the RDR bucket.

    Uses the client to load data directly from the bucket into a table.

    :param client: a bigquery client object
    :param rdr_dataset: The existing dataset to load file data into
    :param bucket: the gcs bucket containing the file data.
    """
    schema_dict = resources.cdm_schemas()
    schema_dict.update(resources.rdr_specific_schemas())

    project = client.project

    for table, schema in schema_dict.items():
        schema_list = bq.get_table_schema(table, schema)
        table_id = f'{project}.{rdr_dataset}.{table}'
        job_config = bigquery.LoadJobConfig(
            schema=schema_list,
            skip_leading_rows=1,
            source_format=bigquery.SourceFormat.CSV,
            field_delimiter=',',
            allow_quoted_newlines=True,
            quote_character='"',
            write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)

        if table == 'observation_period':
            job_config.allow_jagged_rows = True

        for schema_item in schema_list:
            if 'person_id' in schema_item.name and table.lower(
            ) != 'pid_rid_mapping':
                # clustering_fields expects a list of column names
                job_config.clustering_fields = ['person_id']
                job_config.time_partitioning = bigquery.table.TimePartitioning(
                    type_='DAY')

        # path to bucketed csv file
        uri = f'gs://{bucket}/{table}.csv'

        # job_id defined to the second precision
        job_id = f'rdr_load_{table.lower()}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

        LOGGER.info(f'Loading `{uri}` into `{table_id}`')

        try:
            load_job = client.load_table_from_uri(
                uri, table_id, job_config=job_config,
                job_id=job_id)  # Make an API request.
            load_job.result()  # Waits for the job to complete.
        except NotFound:
            LOGGER.info(
                f'{table} not provided by RDR team. Creating empty table '
                f'in dataset: `{rdr_dataset}`')

            LOGGER.info(f'Creating empty CDM table, `{table}`')
            destination_table = bigquery.Table(table_id, schema=schema_list)
            destination_table = client.create_table(destination_table)
            LOGGER.info(f'Created empty table `{destination_table.table_id}`')
        else:
            destination_table = client.get_table(
                table_id)  # Make an API request.
            LOGGER.info(f'Loaded {destination_table.num_rows} rows into '
                        f'`{destination_table.table_id}`.')

    LOGGER.info(f"Finished RDR table LOAD from bucket gs://{bucket}")
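# Standalone sketch of the clustering and partitioning setup used above; per
# the BigQuery client library, clustering_fields must be a list of column
# names (the one-column schema here is a stand-in).
def _example_clustered_load_config():
    from google.cloud import bigquery

    job_config = bigquery.LoadJobConfig(
        schema=[bigquery.SchemaField('person_id', 'INTEGER')],
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV)
    job_config.clustering_fields = ['person_id']
    job_config.time_partitioning = bigquery.TimePartitioning(type_='DAY')
    return job_config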