Example 1
from typing import Optional

# (CloudSqlToBQConfig, BigQueryClientImpl, and the other helpers referenced
# below are imported from elsewhere in the codebase.)
def _hydrate_unioned_regional_dataset_for_schema(
    config: CloudSqlToBQConfig,
    bq_region_override: Optional[str],
    dataset_override_prefix: Optional[str],
) -> None:
    """Given a set of already hydrated single-state datasets, unions the contents
    and copies the results to a dataset that lives in the same region as the CloudSQL
    instance (e.g. us-east1).

    For example, given these tables:
        us_xx_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata
        us_yy_operations_regional
            direct_ingest_raw_file_metadata
            direct_ingest_ingest_file_metadata

    ...we will create a single dataset (or overwrite what exists):
        operations_regional
            direct_ingest_raw_file_metadata  <-- has data from US_XX and US_YY
            direct_ingest_ingest_file_metadata  <-- has data from US_XX and US_YY
    """

    if not config.is_state_segmented_refresh_schema():
        raise ValueError(f"Unexpected schema_type [{config.schema_type}].")

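    # Every state that has direct ingest set up; each one has its own
    # single-state dataset segment.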
    state_codes = get_existing_direct_ingest_states()

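    # Dataset segments for states whose data was refreshed in this run.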
    refreshed_source_table_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value not in config.region_codes_to_exclude
    }

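    # Dataset segments for excluded states; their data was not refreshed, so
    # their schemas may be stale.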
    stale_schema_datasets = {
        config.materialized_dataset_for_segment(state_code)
        for state_code in state_codes
        if state_code.value in config.region_codes_to_exclude
    }
    source_table_datasets = refreshed_source_table_datasets | stale_schema_datasets

    if stale_schema_datasets and refreshed_source_table_datasets:
        # We need to make sure the stale datasets' schemas match the schemas of
        # the datasets that were just refreshed.
        #
        # DISCLAIMER: if a column were renamed in a Postgres migration, that
        # migration would not be properly reflected by this schema update - the
        # rename is applied as a drop plus an add, so the new column would be
        # empty in the stale datasets. This code is meant to handle pure
        # column/table additions and deletions.
        reference_dataset_id = next(iter(refreshed_source_table_datasets))
        if dataset_override_prefix:
            reference_dataset_id = f"{dataset_override_prefix}_{reference_dataset_id}"
            stale_schema_datasets = {
                f"{dataset_override_prefix}_{dataset_id}"
                for dataset_id in stale_schema_datasets
            }

        bq_client = BigQueryClientImpl(region_override=bq_region_override)
        bq_client.update_datasets_to_match_reference_schema(
            reference_dataset_id, list(stale_schema_datasets)
        )

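    # Build one unioning view per table; each view unions that table across
    # all state segments.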
    view_builders = [
        UnionedStateSegmentsViewBuilder(config=config,
                                        table=t,
                                        state_codes=state_codes)
        for t in config.get_tables_to_export()
    ]
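    # For sandbox runs, redirect both the output views and the per-state
    # source tables to prefixed datasets.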
    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
        for dataset in source_table_datasets:
            dataset_overrides[dataset] = f"{dataset_override_prefix}_{dataset}"

    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=source_table_datasets,
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )
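
A minimal usage sketch, not from the source codebase: it assumes a CloudSqlToBQConfig.for_schema_type factory and a SchemaType enum, which may not match the real API.

# Hypothetical driver code; `for_schema_type` and `SchemaType.OPERATIONS`
# are assumed names.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)

# Production refresh: union all state segments into the live
# operations_regional dataset.
_hydrate_unioned_regional_dataset_for_schema(
    config=config,
    bq_region_override=None,
    dataset_override_prefix=None,
)

# Sandbox refresh: results land in my_prefix_operations_regional instead.
_hydrate_unioned_regional_dataset_for_schema(
    config=config,
    bq_region_override="us-east1",
    dataset_override_prefix="my_prefix",
)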
Example 2
def export_all_tables(cloud_sql_to_bq_config: CloudSqlToBQConfig) -> None:
    """Exports every table in the config's schema from CloudSQL to BigQuery,
    one table at a time."""
    tables = cloud_sql_to_bq_config.get_tables_to_export()
    for table in tables:
        export_table(table.name, cloud_sql_to_bq_config)
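
As in Example 1, a minimal usage sketch under the same assumed CloudSqlToBQConfig.for_schema_type factory:

# Hypothetical driver code; the factory and enum names are assumed.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)
export_all_tables(config)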