def setUp(self) -> None:
    self.project_id = "fake-recidiviz-project"
    self.mock_view_dataset_name = "my_views_dataset"
    self.mock_dataset = bigquery.dataset.DatasetReference(
        self.project_id, self.mock_view_dataset_name)

    self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.project_id
    self.project_number_patcher = patch(
        "recidiviz.utils.metadata.project_number")
    self.project_number_patcher.start().return_value = "123456789"

    self.bq_client_patcher = mock.patch(
        "recidiviz.calculator.dataflow_metric_table_manager.BigQueryClientImpl"
    )
    self.mock_client = self.bq_client_patcher.start().return_value
    self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset

    self.mock_dataflow_tables = [
        bigquery.TableReference(self.mock_dataset, "fake_table_1"),
        bigquery.TableReference(self.mock_dataset, "fake_table_2"),
    ]
    self.mock_client.list_tables.return_value = self.mock_dataflow_tables
    self.mock_client.project_id.return_value = self.project_id

    self.fake_dataflow_metrics_dataset = "fake_dataflow_metrics_dataset"
    self.fake_cold_storage_dataset = "fake_cold_storage_dataset"
    self.fake_max_jobs = 10
def __init__(self, project_id, dataset_id, sandbox_dataset_id):
    desc = (
        'Load a lookup of PPI branching rules represented in CSV files. '
        'Store observation rows that violate the rules in a sandbox table. '
        'Stage the cleaned rows in a sandbox table. '
        'Drop and create the observation table with rows from stage.')
    # TODO add depends_on after base classing
    #  manual_cleaning_rules.update_questiona_answers_not_mapped_to_omop
    super().__init__(issue_numbers=ISSUE_NUMBERS,
                     description=desc,
                     affected_datasets=[cdr_consts.RDR],
                     project_id=project_id,
                     dataset_id=dataset_id,
                     sandbox_dataset_id=sandbox_dataset_id,
                     affected_tables=[OBSERVATION])

    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    sandbox_dataset_ref = bigquery.DatasetReference(
        project_id, sandbox_dataset_id)
    self.rule_paths = PPI_BRANCHING_RULE_PATHS
    self.observation_table = bigquery.Table(
        bigquery.TableReference(dataset_ref, OBSERVATION))
    self.lookup_table = bigquery.TableReference(sandbox_dataset_ref,
                                                RULES_LOOKUP_TABLE_ID)
    self.backup_table = bigquery.TableReference(
        sandbox_dataset_ref, OBSERVATION_BACKUP_TABLE_ID)
    self.stage_table = bigquery.TableReference(sandbox_dataset_ref,
                                               OBSERVATION_STAGE_TABLE_ID)
def setUp(self) -> None:
    self.project_id = "fake-recidiviz-project"
    self.mock_view_dataset_name = "my_views_dataset"
    self.mock_dataset = bigquery.dataset.DatasetReference(
        self.project_id, self.mock_view_dataset_name)

    self.mock_dataset_resource = {
        "datasetReference": {
            "projectId": self.project_id,
            "datasetId": self.mock_view_dataset_name,
        },
    }

    self.mock_table_resource = {
        "tableReference": {
            "projectId": self.project_id,
            "datasetId": self.mock_view_dataset_name,
            "tableId": "fake_table",
        },
    }

    self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.project_id
    self.project_number_patcher = patch(
        "recidiviz.utils.metadata.project_number")
    self.project_number_patcher.start().return_value = "123456789"

    self.bq_client_patcher = mock.patch(
        "recidiviz.calculator.calculation_data_storage_manager.BigQueryClientImpl"
    )
    self.mock_client = self.bq_client_patcher.start().return_value
    self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset

    self.mock_dataflow_tables = [
        bigquery.TableReference(self.mock_dataset, "fake_table_1"),
        bigquery.TableReference(self.mock_dataset, "fake_table_2"),
    ]
    self.mock_client.list_tables.return_value = self.mock_dataflow_tables
    self.mock_client.project_id.return_value = self.project_id

    self.data_storage_config_patcher = mock.patch(
        "recidiviz.calculator.dataflow_output_storage_config")
    self.mock_data_storage_config = self.data_storage_config_patcher.start()

    self.fake_dataflow_metrics_dataset = "fake_dataflow_metrics_dataset"
    self.fake_cold_storage_dataset = "fake_cold_storage_dataset"
    self.fake_max_jobs = 10

    self.mock_data_storage_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE.return_value = (
        self.fake_max_jobs)
    self.mock_data_storage_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET = (
        self.fake_cold_storage_dataset)

    app = Flask(__name__)
    app.register_blueprint(calculation_data_storage_manager_blueprint)
    app.config["TESTING"] = True
    self.client = app.test_client()
def patched_get_table(self, table_id):
    if table_id == self.source_table_id:
        return bigquery.Table(
            bigquery.TableReference(
                bigquery.DatasetReference('project_name', 'dataset_name'),
                'source_table'),
            schema=[
                bigquery.SchemaField(
                    'clientId', 'STRING', 'NULLABLE',
                    ('Unhashed version of the Client ID for a'
                     ' given user associated with any given visit/session.')),
                bigquery.SchemaField(
                    'fullVisitorId', 'STRING', 'NULLABLE',
                    'The unique visitor ID (also known as client ID).'),
                bigquery.SchemaField(
                    'visitNumber', 'STRING', 'NULLABLE',
                    ('The session number for this user. '
                     'If this is the first session, then this is set to 1.')),
                bigquery.SchemaField(
                    'totals', 'RECORD', 'NULLABLE',
                    'This section contains aggregate values across the session.',
                    [
                        bigquery.SchemaField(
                            'hits', 'INTEGER', 'NULLABLE',
                            'Total number of hits within the session.'),
                        bigquery.SchemaField(
                            'pageviews', 'INTEGER', 'NULLABLE',
                            'Total number of pageviews within the session.'),
                        bigquery.SchemaField(
                            'screenviews', 'INTEGER', 'NULLABLE',
                            'Total number of screenviews within the session.')
                    ])
            ])
    elif table_id == self.target_table_id:
        return bigquery.Table(
            bigquery.TableReference(
                bigquery.DatasetReference('other_project_name',
                                          'other_dataset_name'),
                'target_table'),
            schema=[
                bigquery.SchemaField('clientId', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('fullVisitorId', 'STRING', 'NULLABLE'),
                bigquery.SchemaField(
                    'totals', 'RECORD', 'NULLABLE',
                    fields=[
                        bigquery.SchemaField('hits', 'INTEGER', 'NULLABLE'),
                        bigquery.SchemaField('pageviews', 'INTEGER',
                                             'NULLABLE')
                    ])
            ])
def setUp(self):
    super(ResultRecorderTest, self).setUp()
    self.client = _build_mock_client()
    dataset_reference = bigquery.DatasetReference(_PROJECT_ID, _DATASET_ID)
    self.count_results_table_reference = bigquery.TableReference(
        dataset_reference, _TABLE_ID_FOR_COUNTS)
    self.id_results_table_reference = bigquery.TableReference(
        dataset_reference, _TABLE_ID_FOR_RESULTS)
    self.recorder = result_recorder.ResultRecorder(
        client=self.client,
        count_results_table_reference=self.count_results_table_reference,
        item_results_table_reference=self.id_results_table_reference)
def __init__(self,
             project_id,
             dataset_id,
             sandbox_dataset_id,
             export_date=None,
             namer=None):
    desc = (f'All new pid/rid mappings will be identified via SQL and '
            f'stored, along with a shift integer, in a sandbox table. '
            f'The table will be read to load into the primary pipeline '
            f'table, pipeline_tables.primary_pid_rid_mapping.')
    namer = dataset_id if not namer else namer
    super().__init__(issue_numbers=['DC1543'],
                     description=desc,
                     affected_datasets=[cdr_consts.RDR],
                     project_id=project_id,
                     dataset_id=dataset_id,
                     sandbox_dataset_id=sandbox_dataset_id,
                     table_namer=namer)

    # primary table ref
    dataset_ref = bigquery.DatasetReference(project_id, PIPELINE_TABLES)
    self.primary_mapping_table = bigquery.TableReference(
        dataset_ref, PRIMARY_PID_RID_MAPPING)

    # rdr table ref
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    self.rdr_table = bigquery.TableReference(dataset_ref, PID_RID_MAPPING)

    # rdr sandbox table ref
    dataset_ref = bigquery.DatasetReference(project_id, sandbox_dataset_id)
    self.rdr_sandbox_table = bigquery.TableReference(
        dataset_ref, self.sandbox_table_for(PID_RID_MAPPING))

    # store fields as json object
    self.fields = fields_for(PRIMARY_PID_RID_MAPPING,
                             sub_path='pipeline_tables')

    # set export date
    try:
        self.export_date = validate_bq_date_string(export_date)
        LOGGER.info(f'Using provided export_date: `{export_date}`')
    except (TypeError, ValueError):
        # otherwise, default to using today's date
        LOGGER.warning(
            f"Failed to validate the export_date: '{export_date}'")
        self.export_date = datetime.now().strftime('%Y-%m-%d')
        LOGGER.warning(f"Setting export_date to now: '{self.export_date}'")
def test_add_missing_fields_to_schema_fields_with_same_name_different_mode(
        self):
    """Tests that the add_missing_fields_to_schema function raises an error
    when the user is trying to add a field with the same name but different
    mode as an existing field."""
    table_ref = bigquery.TableReference(self.mock_dataset,
                                        self.mock_table_id)
    schema_fields = [
        bigquery.SchemaField('fake_schema_field', 'STRING', mode="NULLABLE")
    ]
    table = bigquery.Table(table_ref, schema_fields)

    self.mock_client.get_table.return_value = table

    new_schema_fields = [
        bigquery.SchemaField('fake_schema_field', 'STRING', mode="REQUIRED")
    ]

    with pytest.raises(ValueError):
        self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                    self.mock_table_id,
                                                    new_schema_fields)

    self.mock_client.update_table.assert_not_called()
def test_upload_data_tokyo_non_existing_dataset(self, project_id,
                                                random_dataset_id,
                                                bigquery_client):
    from google.cloud import bigquery

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    non_existing_tokyo_dataset = random_dataset_id
    non_existing_tokyo_destination = "{}.to_gbq_test".format(
        non_existing_tokyo_dataset)

    # Initialize table with sample data
    gbq.to_gbq(
        df,
        non_existing_tokyo_destination,
        project_id,
        credentials=self.credentials,
        location="asia-northeast1",
    )

    table = bigquery_client.get_table(
        bigquery.TableReference(
            bigquery.DatasetReference(project_id,
                                      non_existing_tokyo_dataset),
            "to_gbq_test",
        ))
    assert table.num_rows > 0
def create_external_tables(self, bucket_name, prefix, date, tables, project,
                           dataset, table_prefix, version):
    if table_prefix:
        table_prefix += "_"
    else:
        table_prefix = ""

    gcs_loc = f"gs://{bucket_name}/{prefix}/v{version}/{date}"

    client = bigquery.Client(project=project)
    dataset_ref = client.dataset(dataset)

    for leanplum_name in tables:
        table_name = f"{table_prefix}{leanplum_name}_v{version}_{date}"
        logging.info(f"Creating table {table_name}")

        table_ref = bigquery.TableReference(dataset_ref, table_name)
        table = bigquery.Table(table_ref)

        client.delete_table(table, not_found_ok=True)

        external_config = bigquery.ExternalConfig('CSV')
        external_config.source_uris = [f"{gcs_loc}/{leanplum_name}/*"]
        external_config.autodetect = True
        table.external_data_configuration = external_config

        client.create_table(table)
def test_bq_sql_aggregate_integer(self):
    project = 'project'
    dataset_id = 'dataset_id'
    table_id = 'table_id'
    name = 'name'
    i = 0
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    table_ref = bigquery.TableReference(dataset_ref, table_id)
    f = bigquery.SchemaField(name, 'INTEGER')
    expected = (
        "SELECT 'name' name, 'INTEGER' type, COUNT(name) count, AVG(name) average, "
        'STDDEV_SAMP(name) std, CAST(MAX(name) AS STRING) max, CAST(MIN(name) AS '
        'STRING) min, CAST(APPROX_TOP_COUNT(name, 1)[ORDINAL(1)].value AS STRING) '
        'mode, COUNT(*) - COUNT(name) miss_count, SAFE_DIVIDE(COUNT(*) - COUNT(name), '
        'COUNT(*)) miss_rate, 0 miss_days, COUNT(DISTINCT name) unique_count, '
        'SAFE_DIVIDE(COUNT(DISTINCT name), COUNT(name)) unique_rate, '
        'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(2)] AS STRING) quantile4_1, '
        'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(3)] AS STRING) median, '
        'CAST(APPROX_QUANTILES(name, 4)[ORDINAL(4)] AS STRING) quantile4_3, '
        'CAST(APPROX_QUANTILES(name, 100)[ORDINAL(2)] AS STRING) quantile100_1, '
        'CAST(APPROX_QUANTILES(name, 100)[ORDINAL(100)] AS STRING) quantile100_99, '
        'COUNTIF(name >= 0) not_negatives, COUNTIF(name = 0) zeros, 0 empty_strings, '
        '0 ord FROM dataset_id.table_id')
    actual = bq_sql.aggregate(f, table_ref, i, '""')
    self.assertEqual(actual, expected)
def create_bq_view_of_joined_features_and_entities(
        source: BigQuerySource, entity_source: BigQuerySource,
        entity_names: List[str]) -> BigQuerySource:
    """
    Creates BQ view that joins tables from `source` and `entity_source`
    with join key derived from `entity_names`.
    Returns BigQuerySource with reference to created view.
    """
    bq_client = bigquery.Client()

    source_ref = table_reference_from_string(source.bigquery_options.table_ref)
    entities_ref = table_reference_from_string(
        entity_source.bigquery_options.table_ref)

    destination_ref = bigquery.TableReference(
        bigquery.DatasetReference(source_ref.project, source_ref.dataset_id),
        f"_view_{source_ref.table_id}_{datetime.now():%Y%m%d%H%M%s}",
    )

    view = bigquery.Table(destination_ref)
    view.view_query = JOIN_TEMPLATE.format(
        entities=entities_ref,
        source=source_ref,
        entity_key=" AND ".join(
            [f"source.{e} = entities.{e}" for e in entity_names]),
    )
    view.expires = datetime.now() + timedelta(days=1)
    bq_client.create_table(view)

    return BigQuerySource(
        event_timestamp_column=source.event_timestamp_column,
        created_timestamp_column=source.created_timestamp_column,
        table_ref=f"{view.project}:{view.dataset_id}.{view.table_id}",
        field_mapping=source.field_mapping,
        date_partition_column=source.date_partition_column,
    )
def upload_csv_data_to_bq_table(client, dataset_id, table_name, fq_file_path,
                                write_disposition):
    """
    Uploads data from local csv file to bigquery table

    :param client: an instantiated bigquery client object
    :param dataset_id: identifies the dataset
    :param table_name: identifies the table name where data needs to be uploaded
    :param fq_file_path: Fully qualified path to the csv file which needs to be uploaded
    :param write_disposition: Write disposition for job choose b/w write_empty, write_append, write_truncate
    :return: job result
    """
    dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
    table_ref = bigquery.TableReference(dataset_ref, table_name)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job_config.write_disposition = write_disposition

    LOGGER.info(f"Uploading {fq_file_path} data to {dataset_id}.{table_name}")
    with open(fq_file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file,
                                          table_ref,
                                          job_config=job_config)
    try:
        result = job.result()  # Waits for table load to complete.
    except (BadRequest, OSError, AttributeError, TypeError, ValueError) as exp:
        message = f"Unable to load data to table {table_name}"
        LOGGER.exception(message)
        raise exp

    return result
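A brief usage sketch for the loader above (not part of the original source); the project, dataset, table, and file names are illustrative assumptions:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # hypothetical project
result = upload_csv_data_to_bq_table(
    client=client,
    dataset_id="staging_dataset",        # hypothetical dataset
    table_name="observation",            # hypothetical table
    fq_file_path="/tmp/observation.csv",  # hypothetical local CSV
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
print(result.output_rows)  # job.result() returns the completed LoadJob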
def stage_entities_to_bq(entity_source: pd.DataFrame, project: str,
                         dataset: str) -> BigQuerySource:
    """
    Stores given (entity) dataframe as new table in BQ. Name of the table
    generated based on current time.
    Table will expire in 1 day.
    Returns BigQuerySource with reference to created table.
    """
    bq_client = bigquery.Client()
    destination = bigquery.TableReference(
        bigquery.DatasetReference(project, dataset),
        f"_entities_{datetime.now():%Y%m%d%H%M%s}",
    )

    # prevent casting ns -> ms exception inside pyarrow
    entity_source["event_timestamp"] = entity_source[
        "event_timestamp"].dt.floor("ms")

    load_job: bigquery.LoadJob = bq_client.load_table_from_dataframe(
        entity_source, destination)
    load_job.result()  # wait until complete

    dest_table: bigquery.Table = bq_client.get_table(destination)
    dest_table.expires = datetime.now() + timedelta(days=1)
    bq_client.update_table(dest_table, fields=["expires"])

    return BigQuerySource(
        event_timestamp_column="event_timestamp",
        table_ref=
        f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
    )
def setUp(self):
    self.project_id = 'foo_project'
    self.dataset_id = 'foo_dataset'
    self.sandbox_dataset_id = 'foo_sandbox'
    self.mapping_dataset_id = 'foo_mapping_dataset'
    self.client = None
    self.maxDiff = None

    self.hpo_list = [{
        "hpo_id": "hpo_1",
        "name": "hpo_name_1"
    }, {
        "hpo_id": "hpo_2",
        "name": "hpo_name_2"
    }]

    self.fields = [{
        "type": "integer",
        "name": "foo_id",
        "mode": "nullable",
        "description": "The foo_id used in the foo table."
    }, {
        "type": "string",
        "name": "src_id",
        "mode": "nullable",
        "description": "The provenance of the data associated with the foo_id."
    }]

    mapped_table_names = [
        cdm_table for cdm_table in common.AOU_REQUIRED if cdm_table not in
        [common.PERSON, common.DEATH, common.FACT_RELATIONSHIP]
    ]
    mapping_table_names = [
        gen_ext.MAPPING_PREFIX + cdm_table
        for cdm_table in mapped_table_names
    ]

    dataset_ref = bigquery.DatasetReference(self.project_id, self.dataset_id)
    self.mapping_table_objs = [
        bigquery.TableReference(dataset_ref, table_name)
        for table_name in mapping_table_names + mapped_table_names
    ]

    self.rule_instance = gen_ext.GenerateExtTables(self.project_id,
                                                   self.dataset_id,
                                                   self.sandbox_dataset_id,
                                                   self.mapping_dataset_id)

    self.assertEqual(self.rule_instance.project_id, self.project_id)
    self.assertEqual(self.rule_instance.dataset_id, self.dataset_id)
    self.assertEqual(self.rule_instance.sandbox_dataset_id,
                     self.sandbox_dataset_id)
def upload_to_bq(bq_filename):
    """Uploads json file from GCS to BigQuery"""
    bq_client = bigquery.Client(project=PROJECT_ID)
    logging.debug("uploading to BQ")
    job_id = 'bq_import_{}'.format(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    gcs_file_path = 'gs://' + GCS_BUCKET + '/' + bq_filename

    job_config = bigquery.job.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField(
            'project_id',
            'STRING',
            description='GCP project where from where public dataset is'),
        bigquery.SchemaField('dataset_id',
                             'STRING',
                             description='BigQuery dataset'),
        bigquery.SchemaField('table_id',
                             'STRING',
                             description='Table id (name) of public dataset'),
        bigquery.SchemaField('dataset_description', 'STRING'),
        bigquery.SchemaField('table_description', 'STRING'),
        bigquery.SchemaField('table_created',
                             'TIMESTAMP',
                             description='Datetime when table was created'),
        bigquery.SchemaField(
            'table_modified',
            'TIMESTAMP',
            description=
            'Datetime when table was last time modified (inserted rows)'),
        bigquery.SchemaField('table_num_bytes',
                             'INTEGER',
                             description='Size of table in bytes'),
        bigquery.SchemaField('table_num_rows',
                             'INTEGER',
                             description='Number of rows in the table'),
        bigquery.SchemaField(
            'table_type',
            'STRING',
            description='Whether table is normal table, view or BQ ML model'),
        bigquery.SchemaField(
            'table_partitioning_type',
            'STRING',
            description='Whether and how table is partitioned'),
    ]
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    dataset_ref = bq_client.dataset(OUT_DATASET_ID)
    table_ref = bigquery.TableReference(dataset_ref, OUT_TABLE_ID)
    job = bigquery.LoadJob(job_id, [gcs_file_path],
                           client=bq_client,
                           destination=table_ref,
                           job_config=job_config)
    job.result()
    logging.debug("done")
def table_reference_from_string(table_ref: str):
    """
    Parses reference string with format "{project}:{dataset}.{table}" into
    bigquery.TableReference
    """
    project, dataset_and_table = table_ref.split(":")
    dataset, table_id = dataset_and_table.split(".")
    return bigquery.TableReference(
        bigquery.DatasetReference(project, dataset), table_id)
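A minimal usage sketch for the parser above; the "my-project:telemetry.events" reference is an illustrative assumption, not from the original source:

from google.cloud import bigquery

ref = table_reference_from_string("my-project:telemetry.events")
assert isinstance(ref, bigquery.TableReference)
print(ref.project, ref.dataset_id, ref.table_id)  # my-project telemetry events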
def get_table_if_exists(self, table_name):
    table_ref = bigquery.TableReference(self.dataset_ref, table_name)
    try:
        table = self.client.get_table(table_ref)
    except NotFound:
        return None
    else:
        return table
def get_table_refs(bq_client: bigquery.Client,
                   bq_storage_client: bigquery_storage.BigQueryReadClient,
                   dataset: bigquery.Dataset):
    query = f"select table_id from {dataset.project}.{dataset.dataset_id}.__TABLES__"
    df: pd.DataFrame = bq_client.query(query).to_dataframe(
        bqstorage_client=bq_storage_client)
    for table_id in df.table_id:
        if table_id is not None:
            yield bigquery.TableReference(dataset.reference, table_id)
def _table_from_ft(ft_schema: dict) -> bigquery.Table:
    """Create a local representation of a BigQuery table"""
    # A "TableSchema" is just a sequence of SchemaFields
    # https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.table.Table.html
    schema = list(map(_create_field_schema, ft_schema['columns']))
    # `ds` is expected to be a bigquery.DatasetReference defined at module level.
    table = bigquery.Table(
        bigquery.TableReference(ds, to_safe_name(ft_schema['name'])), schema)
    table.description = ft_schema.get('description', '')
    return table
def copy_table():
    """Copy temporary table to final destination"""
    logging.info("copy table")
    dataset_ref = bigquery.DatasetReference(GCP_PROJECT, DATASET_NAME)
    table_ref = bigquery.TableReference(dataset_ref, TABLE_NAME)
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, table_ref, job_config=copyjob_config)
def try_get_table():
    try:
        table = bq_client.get_table(
            bigquery.TableReference(
                bigquery.DatasetReference(bq_project, bq_dataset),
                bq_table_id))
    except NotFound:
        return None, False
    else:
        return table, True
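A hedged usage sketch, assuming the names try_get_table() reads (bq_client, bq_project, bq_dataset, bq_table_id) are bound in an enclosing or module scope as the original code implies; the concrete values are illustrative:

from google.cloud import bigquery
from google.cloud.exceptions import NotFound

bq_client = bigquery.Client()
bq_project, bq_dataset, bq_table_id = "my-project", "analytics", "events"

table, exists = try_get_table()
if not exists:
    print(f"Table {bq_project}.{bq_dataset}.{bq_table_id} does not exist yet")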
def setUp(self):
    self.project_id = 'fake-recidiviz-project'
    self.mock_view_dataset_name = 'my_views_dataset'
    self.mock_dataset = bigquery.dataset.DatasetReference(
        self.project_id, self.mock_view_dataset_name)

    self.project_id_patcher = patch('recidiviz.utils.metadata.project_id')
    self.project_id_patcher.start().return_value = self.project_id
    self.project_number_patcher = patch(
        'recidiviz.utils.metadata.project_number')
    self.project_number_patcher.start().return_value = '123456789'

    self.bq_client_patcher = mock.patch(
        'recidiviz.calculator.calculation_data_storage_manager.BigQueryClientImpl'
    )
    self.mock_client = self.bq_client_patcher.start().return_value
    self.mock_client.dataset_ref_for_id.return_value = self.mock_dataset

    self.mock_dataflow_tables = [
        bigquery.TableReference(self.mock_dataset, 'fake_table_1'),
        bigquery.TableReference(self.mock_dataset, 'fake_table_2')
    ]
    self.mock_client.list_tables.return_value = self.mock_dataflow_tables
    self.mock_client.project_id.return_value = self.project_id

    self.data_storage_config_patcher = mock.patch(
        'recidiviz.calculator.calculation_data_storage_config')
    self.mock_data_storage_config = self.data_storage_config_patcher.start()

    self.fake_dataflow_metrics_dataset = 'fake_dataflow_metrics_dataset'
    self.fake_cold_storage_dataset = 'fake_cold_storage_dataset'
    self.fake_max_jobs = 10
    self.mock_data_storage_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE.return_value = self.fake_max_jobs
    self.mock_data_storage_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET = self.fake_cold_storage_dataset

    app = Flask(__name__)
    app.register_blueprint(calculation_data_storage_manager_blueprint)
    app.config['TESTING'] = True
    self.client = app.test_client()
def test_is_table_definition_in_match_with_bigquery(self):
    dataset_reference = bigquery.DatasetReference('project', 'dataset')
    table_reference = bigquery.TableReference(dataset_reference, 'table')
    schema = [
        bigquery.SchemaField('col1', 'INTEGER'),
        bigquery.SchemaField('col2', 'STRING')
    ]
    table = bigquery.Table(table_reference, schema)

    assert schema_mapper.is_table_definition_in_match_with_bigquery(
        schema, table)
def deploy_features_table():
    """Copy the UNION temp features table to final destination"""
    logging.info("copy table")
    target_dataset_ref = bigquery.DatasetReference(GCP_PROJECT,
                                                   BQ_TARGET_DATASET)
    target_table_ref = bigquery.TableReference(target_dataset_ref, 'features')
    copyjob_config = bigquery.CopyJobConfig()
    copyjob_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    copyjob_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    bq.copy_table(temp_table_ref, target_table_ref, job_config=copyjob_config)
def copy_tables_to_public_dataset():
    """copies all tables from internal dataset to BigQuery public dataset

    Service account which is used for access to public dataset project
    is stored in GCS.
    """
    gcs = storage.Client(project=GCP_PROJECT)
    bucket = gcs.bucket(GCS_BUCKET)
    sa_blob = bucket.blob(SERVICE_ACCOUNT_FILENAME)
    sa_blob.reload()
    f = BytesIO()
    sa_blob.download_to_file(f)
    sa_content = f.getvalue().decode('utf8')
    f.close()
    sa_content = json.loads(sa_content)
    credentials = service_account.Credentials.from_service_account_info(
        sa_content)

    bq = bigquery.Client(project=GCP_PROJECT, credentials=credentials)
    job_config = bigquery.CopyJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED

    source_dataset = bigquery.DatasetReference(GCP_PROJECT, BQ_DATASET)
    destination_dataset = bigquery.DatasetReference(BQ_PUBLIC_PROJECT,
                                                    BQ_PUBLIC_DATASET)
    table_refs = []
    tables_res = bq.list_tables(source_dataset)
    for table_ref in tables_res:
        table_refs.append(table_ref)

    for table_ref in table_refs:
        table_name = table_ref.table_id
        source_table = bigquery.TableReference(source_dataset, table_name)
        destination_table = bigquery.TableReference(destination_dataset,
                                                    table_name)
        bq.copy_table([source_table],
                      destination_table,
                      job_config=job_config)
        logging.info(f"copying table {table_name} to public dataset")
def test_add_missing_fields_to_schema(self):
    """Tests that the add_missing_fields_to_schema function calls the client
    to update the table."""
    table_ref = bigquery.TableReference(self.mock_dataset,
                                        self.mock_table_id)
    schema_fields = [bigquery.SchemaField('fake_schema_field', 'STRING')]
    table = bigquery.Table(table_ref, schema_fields)

    self.mock_client.get_table.return_value = table

    new_schema_fields = [bigquery.SchemaField('new_schema_field', 'STRING')]

    self.bq_client.add_missing_fields_to_schema(self.mock_dataset_id,
                                                self.mock_table_id,
                                                new_schema_fields)

    self.mock_client.update_table.assert_called()
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=
            f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table, original)
def create_table_with_schema(
        self, dataset_id, table_id,
        schema_fields: List[bigquery.SchemaField]) -> bigquery.Table:
    dataset_ref = self.dataset_ref_for_id(dataset_id)

    if self.table_exists(dataset_ref, table_id):
        raise ValueError(
            f"Trying to create a table that already exists: {dataset_id}.{table_id}."
        )

    table_ref = bigquery.TableReference(dataset_ref, table_id)
    table = bigquery.Table(table_ref, schema_fields)

    logging.info("Creating table %s.%s", dataset_id, table_id)
    return self.client.create_table(table)
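A short usage sketch, assuming `bq_client` is an instance of the wrapper class that defines create_table_with_schema(); the dataset, table, and field names are illustrative:

from google.cloud import bigquery

schema = [
    bigquery.SchemaField("person_id", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("state_code", "STRING", mode="NULLABLE"),
]
# Raises ValueError if my_dataset.my_table already exists.
table = bq_client.create_table_with_schema("my_dataset", "my_table", schema)
print(table.full_table_id)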
def create_table(self, schema, dataset_id, table_id):
    """
    :param schema: list of bigquery.SchemaField objects describing the table
    :param dataset_id: identifies the dataset that will hold the table
    :param table_id: identifies the table to create
    :return: None
    """
    dataset_ref = self.get_dataset(dataset_id=dataset_id)
    table_ref = bigquery.TableReference(dataset_ref=dataset_ref,
                                        table_id=table_id)
    table = bigquery.Table(table_ref, schema=schema)
    table = self.client.create_table(table)  # API request

    print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                          table.table_id))
def test_remove_unused_fields_from_schema_no_missing_fields(self) -> None:
    """Tests that remove_unused_fields_from_schema() does nothing if there are
    no missing fields."""
    table_ref = bigquery.TableReference(self.mock_dataset_ref,
                                        self.mock_table_id)
    schema_fields = [bigquery.SchemaField("field_1", "STRING")]
    table = bigquery.Table(table_ref, schema_fields)

    self.mock_client.get_table.return_value = table

    new_schema_fields = [bigquery.SchemaField("field_1", "STRING")]

    self.bq_client.remove_unused_fields_from_schema(
        self.mock_dataset_id, self.mock_table_id, new_schema_fields
    )

    self.mock_client.query.assert_not_called()