def _federated_bq_regional_dataset_refresh(
    config: CloudSqlToBQConfig,
    dataset_override_prefix: Optional[str] = None,
) -> None:
    """Queries data in the appropriate CloudSQL instance for the given schema /
    config and loads it into a single, unified dataset **in the same** region as
    the CloudSQL instance. In the process, creates / updates views that provide
    direct federated connections to the CloudSQL instance and intermediate
    state-segmented datasets (where appropriate).

    Example resulting datasets (OPERATIONS schema):
        operations_cloudsql_connection  <-- Federated views
        us_xx_operations_regional       <-- Materialized data from most recent export for state
        us_yy_operations_regional
        operations_regional             <-- Materialized data from most recent export for each state

    Args:
        config: Refresh configuration for one schema type.
        dataset_override_prefix: If set, all output datasets are prefixed
            (sandbox-style run); if None, the live datasets are written.
    """
    # State-segmented schemas get one federated view per (state, table);
    # unsegmented schemas get a single flat collection of views.
    collector: BigQueryViewCollector[FederatedCloudSQLTableBigQueryViewBuilder]
    if config.is_state_segmented_refresh_schema():
        collector = StateSegmentedSchemaFederatedBigQueryViewCollector(config)
    else:
        collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config)
    view_builders = collector.collect_view_builders()

    # TODO(#7285): Migrate Justice Counts connection to be in same region as instance
    bq_region_override = (
        None
        if config.schema_type == SchemaType.JUSTICE_COUNTS
        else SQLAlchemyEngineManager.get_cloudsql_instance_region(config.schema_type)
    )

    dataset_overrides = (
        dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
        if dataset_override_prefix
        else None
    )

    create_managed_dataset_and_deploy_views_for_view_builders(
        # The federated views read straight from CloudSQL, not from BQ tables.
        view_source_table_datasets=set(),
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )

    # Only segmented schemas need the per-state results unioned afterwards.
    if config.is_state_segmented_refresh_schema():
        _hydrate_unioned_regional_dataset_for_schema(
            config, bq_region_override, dataset_override_prefix
        )
def __init__(
    self,
    *,
    config: CloudSqlToBQConfig,
    table: Table,
    state_codes: List[StateCode],
):
    """Builds a view that unions one table's data across all state segments.

    Args:
        config: Refresh configuration; must be for a state-segmented schema.
        table: The schema table whose per-state segments will be unioned.
        state_codes: The states whose segment datasets feed the union.

    Raises:
        ValueError: If |config| is not for a state-segmented refresh schema.
    """
    if not config.is_state_segmented_refresh_schema():
        raise ValueError(
            f"Unexpected schema type [{config.schema_type.name}]")
    self.config = config
    self.table = table
    self.state_codes = state_codes
    # Dataset prefixing will be handled automatically by view building logic
    self.dataset_id = config.unioned_regional_dataset(
        dataset_override_prefix=None)
    self.view_id = f"{table.name}_view"
    # Materialize the union view to a table named exactly like the source
    # table, inside the unioned regional dataset.
    self.materialized_address_override = BigQueryAddress(
        dataset_id=self.dataset_id,
        table_id=table.name,
    )
def _hydrate_unioned_regional_dataset_for_schema(
    config: CloudSqlToBQConfig,
    bq_region_override: Optional[str],
    dataset_override_prefix: Optional[str],
) -> None:
    """Given a set of already hydrated single-state datasets, unions the
    contents and copies the results to a dataset that lives in the same region
    as the CloudSQL instance (e.g. us-east1).

    For example given these tables:
        us_xx_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata
        us_yy_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata

    ...we will create a single dataset (or overwrite what exists):
        operations_regional
            direct_ingest_raw_file_metadata  <-- has data from US_XX and US_YY
            direct_ingest_ingest_file_metadata  <-- has data from US_XX and US_YY

    Args:
        config: Refresh configuration; must be for a state-segmented schema.
        bq_region_override: BQ region the datasets live in, or None.
        dataset_override_prefix: Sandbox prefix applied to all datasets, or
            None for a live run.

    Raises:
        ValueError: If |config| is not for a state-segmented refresh schema.
    """
    if not config.is_state_segmented_refresh_schema():
        raise ValueError(f"Unexpected schema_type [{config.schema_type}].")

    state_codes = get_existing_direct_ingest_states()
    # Segments that were refreshed in this run (states not excluded by config).
    refreshed_source_table_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value not in config.region_codes_to_exclude
    }
    # Segments for excluded states: their data was NOT re-exported this run,
    # so their table schemas may be stale relative to the refreshed ones.
    stale_schema_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value in config.region_codes_to_exclude
    }
    # The union view reads from every segment, refreshed or not.
    source_table_datasets = refreshed_source_table_datasets | stale_schema_datasets

    if stale_schema_datasets and refreshed_source_table_datasets:
        # We need to make sure the schemas match those that are refreshed.
        #
        # DISCLAIMER: if a column were renamed in a Postgres migration, that migration
        # would not be properly reflected with this schema update - the data in the new
        # column would be wiped for the new schemas. This code is meant to handle pure
        # column/table additions and deletions.

        # Any refreshed segment serves as the schema reference — they were all
        # just exported from the same Postgres schema.
        reference_dataset_id = next(iter(refreshed_source_table_datasets))
        if dataset_override_prefix:
            # In a sandbox run the actual datasets carry the prefix, so apply
            # it before talking to BigQuery. NOTE: this rebinding of
            # |stale_schema_datasets| is local to the schema-update step;
            # |source_table_datasets| (built above) still holds the
            # unprefixed names used for view deployment below.
            reference_dataset_id = f"{dataset_override_prefix}_{reference_dataset_id}"
            stale_schema_datasets = {
                f"{dataset_override_prefix}_{dataset_id}"
                for dataset_id in stale_schema_datasets
            }
        bq_client = BigQueryClientImpl(region_override=bq_region_override)
        bq_client.update_datasets_to_match_reference_schema(
            reference_dataset_id, list(stale_schema_datasets))

    # One union view per exported table, each unioning all state segments.
    view_builders = [
        UnionedStateSegmentsViewBuilder(config=config, table=t, state_codes=state_codes)
        for t in config.get_tables_to_export()
    ]
    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
        # The source segment datasets are not produced by these view builders,
        # so their prefixed names must be registered as overrides explicitly.
        for dataset in source_table_datasets:
            dataset_overrides[dataset] = f"{dataset_override_prefix}_{dataset}"
    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=source_table_datasets,
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )
def __init__(self, config: CloudSqlToBQConfig):
    """Creates a collector for a schema whose refresh is NOT state-segmented.

    Args:
        config: Refresh configuration for one schema type.

    Raises:
        ValueError: If |config| is for a state-segmented refresh schema —
            those must use the state-segmented collector instead.
    """
    # Guard clause: reject configs that belong to the segmented collector.
    if config.is_state_segmented_refresh_schema():
        raise ValueError(
            "Only valid for unsegmented schema types. Cannot be instantiated with "
            f"schema_type [{config.schema_type}]"
        )
    self.config = config