Example #1
def _federated_bq_regional_dataset_refresh(
    config: CloudSqlToBQConfig,
    dataset_override_prefix: Optional[str] = None,
) -> None:
    """Queries data in the appropriate CloudSQL instance for the given schema / conifg
    and loads it into a single, unified dataset **in the same** region as the CloudSQL
    instance. In the process, creates / updates views that provide direct federated
    connections to the CloudSQL instance and intermediate state-segmented datasets
    (where appropriate).

    Example resulting datasets (OPERATIONS schema):
      operations_cloudsql_connection  <-- Federated views
      us_xx_operations_regional  <-- Materialized data from most recent export for state
      us_yy_operations_regional
      operations_regional  <-- Materialized data from most recent export for each state
    """

    if config.is_state_segmented_refresh_schema():
        collector: BigQueryViewCollector[
            FederatedCloudSQLTableBigQueryViewBuilder
        ] = StateSegmentedSchemaFederatedBigQueryViewCollector(config)
    else:
        collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config)

    view_builders = collector.collect_view_builders()

    # TODO(#7285): Migrate Justice Counts connection to be in same region as instance
    if config.schema_type == SchemaType.JUSTICE_COUNTS:
        bq_region_override = None
    else:
        bq_region_override = SQLAlchemyEngineManager.get_cloudsql_instance_region(
            config.schema_type)

    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=set(),
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )

    if config.is_state_segmented_refresh_schema():
        _hydrate_unioned_regional_dataset_for_schema(
            config, bq_region_override, dataset_override_prefix
        )
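
For context, a minimal sketch of how this refresh might be invoked. The
for_schema_type constructor and the SchemaType.OPERATIONS value are assumptions
inferred from this code, not confirmed entry points:

# Hypothetical invocation sketch (for_schema_type is an assumed constructor).
config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)

# Production refresh: hydrates the datasets listed in the docstring
# (operations_cloudsql_connection, us_xx_operations_regional, ...,
# operations_regional).
_federated_bq_regional_dataset_refresh(config)

# Sandbox refresh: the same datasets, each written under a "my_prefix_" prefix
# instead (see the dataset_overrides logic above).
_federated_bq_regional_dataset_refresh(config, dataset_override_prefix="my_prefix")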
Example #2
    def __init__(
        self,
        *,
        config: CloudSqlToBQConfig,
        table: Table,
        state_codes: List[StateCode],
    ):
        if not config.is_state_segmented_refresh_schema():
            raise ValueError(
                f"Unexpected schema type [{config.schema_type.name}]")

        self.config = config
        self.table = table
        self.state_codes = state_codes
        # Dataset prefixing will be handled automatically by the view building logic.
        self.dataset_id = config.unioned_regional_dataset(
            dataset_override_prefix=None
        )
        self.view_id = f"{table.name}_view"
        self.materialized_address_override = BigQueryAddress(
            dataset_id=self.dataset_id,
            table_id=table.name,
        )
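
The keyword arguments here match the UnionedStateSegmentsViewBuilder(config=...,
table=..., state_codes=...) call in Example #3, so this is presumably that
class's constructor. A hypothetical illustration of the attributes it derives,
assuming unioned_regional_dataset() returns "operations_regional" (per the
Example #1 docstring) and a table named "direct_ingest_raw_file_metadata":

# Hypothetical: operations_config and raw_file_metadata_table are stand-ins
# for a real CloudSqlToBQConfig and SQLAlchemy Table.
builder = UnionedStateSegmentsViewBuilder(
    config=operations_config,
    table=raw_file_metadata_table,
    state_codes=[StateCode.US_XX, StateCode.US_YY],
)
# builder.dataset_id                     -> "operations_regional"
# builder.view_id                        -> "direct_ingest_raw_file_metadata_view"
# builder.materialized_address_override  -> operations_regional.direct_ingest_raw_file_metadata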
Example #3
def _hydrate_unioned_regional_dataset_for_schema(
    config: CloudSqlToBQConfig,
    bq_region_override: Optional[str],
    dataset_override_prefix: Optional[str],
) -> None:
    """Given a set of already hydrated single-state datasets, unions the contents
    and copies the results to a dataset that lives in the same region as the CloudSQL
    instance (e.g. us-east1).

    For example given these tables:
        us_xx_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata
        us_yy_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata

    ...we will create a single dataset (or overwrite what exists):
        operations_regional
            direct_ingest_raw_file_metadata  <-- has data from US_XX and US_YY
            direct_ingest_ingest_file_metadata  <-- has data from US_XX and US_YY
    """

    if not config.is_state_segmented_refresh_schema():
        raise ValueError(f"Unexpected schema_type [{config.schema_type}].")

    state_codes = get_existing_direct_ingest_states()

    refreshed_source_table_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value not in config.region_codes_to_exclude
    }

    stale_schema_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value in config.region_codes_to_exclude
    }
    source_table_datasets = refreshed_source_table_datasets | stale_schema_datasets

    if stale_schema_datasets and refreshed_source_table_datasets:
        # We need to make sure the schemas match those that are refreshed.
        #
        # DISCLAIMER: if a column were renamed in a Postgres migration, that migration
        # would not be properly reflected with this schema update - the data in the new
        # column would be wiped for the new schemas. This code is meant to handle pure
        # column/table additions and deletions.
        reference_dataset_id = next(iter(refreshed_source_table_datasets))
        if dataset_override_prefix:
            reference_dataset_id = f"{dataset_override_prefix}_{reference_dataset_id}"
            stale_schema_datasets = {
                f"{dataset_override_prefix}_{dataset_id}"
                for dataset_id in stale_schema_datasets
            }

        bq_client = BigQueryClientImpl(region_override=bq_region_override)
        bq_client.update_datasets_to_match_reference_schema(
            reference_dataset_id, list(stale_schema_datasets))

    view_builders = [
        UnionedStateSegmentsViewBuilder(config=config,
                                        table=t,
                                        state_codes=state_codes)
        for t in config.get_tables_to_export()
    ]
    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
        for dataset in source_table_datasets:
            dataset_overrides[dataset] = f"{dataset_override_prefix}_{dataset}"

    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=source_table_datasets,
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )
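
The refreshed/stale partition computed near the top of this function decides
which segment datasets need a schema update before the union. A self-contained
sketch of that set logic, using hypothetical values and a stand-in for
config.materialized_dataset_for_segment:

# Self-contained sketch of the refreshed/stale dataset partition above.
# The values and the helper are hypothetical stand-ins for config methods.
state_codes = ["US_XX", "US_YY"]
region_codes_to_exclude = {"US_YY"}  # states whose data was not refreshed

def materialized_dataset_for_segment(state_code: str) -> str:
    # Mirrors the naming in the docstring, e.g. "us_xx_operations_regional".
    return f"{state_code.lower()}_operations_regional"

refreshed = {
    materialized_dataset_for_segment(s)
    for s in state_codes
    if s not in region_codes_to_exclude
}
stale = {
    materialized_dataset_for_segment(s)
    for s in state_codes
    if s in region_codes_to_exclude
}
assert refreshed == {"us_xx_operations_regional"}
assert stale == {"us_yy_operations_regional"}
# The stale datasets are altered to match a refreshed dataset's schema before
# the union views read from refreshed | stale, so pure column/table additions
# and deletions do not break the union.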
Example #4
    def __init__(self, config: CloudSqlToBQConfig):
        if config.is_state_segmented_refresh_schema():
            raise ValueError(
                f"Only valid for unsegmented schema types. Cannot be instantiated with "
                f"schema_type [{config.schema_type}]"
            )
        self.config = config
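
This guard is the inverse of the one in Example #2: state-segmented schema
types are served by StateSegmentedSchemaFederatedBigQueryViewCollector and
UnionedStateSegmentsViewBuilder, while every other schema type flows through
this unsegmented collector, so each CloudSqlToBQConfig is accepted by exactly
one of the two code paths selected in Example #1.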