def main(
    sandbox_dataset_prefix: str,
    schema_type: SchemaType,
    direct_ingest_instance: Optional[DirectIngestInstance],
) -> None:
    """Defines the main function responsible for moving data from Postgres to BQ."""
    logging.info("Prefixing all output datasets with [%s_].",
                 known_args.sandbox_dataset_prefix)
    fake_gcs = FakeGCSFileSystem()

    # We mock the export config to a version that does not have any paused regions.
    with mock.patch(
            f"{cloud_sql_to_bq_refresh_config.__name__}.GcsfsFactory.build",
            return_value=fake_gcs,
    ):
        fake_gcs.upload_from_string(
            path=CloudSqlToBQConfig.default_config_path(),
            contents=STANDARD_YAML_CONTENTS,
            content_type="text/yaml",
        )
        federated_bq_schema_refresh(
            schema_type=schema_type,
            direct_ingest_instance=direct_ingest_instance,
            dataset_override_prefix=sandbox_dataset_prefix,
        )
        config = CloudSqlToBQConfig.for_schema_type(schema_type)
        final_destination_dataset = config.unioned_multi_region_dataset(
            dataset_override_prefix=sandbox_dataset_prefix)

    logging.info("Load complete. Data loaded to dataset [%s].",
                 final_destination_dataset)
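
# Hypothetical CLI entrypoint sketch for main() above. The argparse wiring and
# argument names are assumptions (not taken from the source); they simply mirror
# main()'s parameters and assume SchemaType / DirectIngestInstance can be
# constructed from their upper-case string values, as elsewhere in these examples.
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument("--sandbox_dataset_prefix", required=True)
    parser.add_argument("--schema_type", required=True)
    parser.add_argument("--direct_ingest_instance", default=None)
    known_args, _ = parser.parse_known_args()
    main(
        sandbox_dataset_prefix=known_args.sandbox_dataset_prefix,
        schema_type=SchemaType(known_args.schema_type),
        direct_ingest_instance=(
            DirectIngestInstance(known_args.direct_ingest_instance)
            if known_args.direct_ingest_instance
            else None
        ),
    )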
    def test_for_schema_type_returns_instance(self) -> None:
        for schema_type in self.schema_types:
            if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
                with self.assertRaises(ValueError):
                    _ = CloudSqlToBQConfig.for_schema_type(schema_type)
            else:
                config = CloudSqlToBQConfig.for_schema_type(schema_type)
                self.assertIsInstance(config, CloudSqlToBQConfig)
Example 3
def _federated_bq_regional_dataset_refresh(
    config: CloudSqlToBQConfig,
    dataset_override_prefix: Optional[str] = None,
) -> None:
    """Queries data in the appropriate CloudSQL instance for the given schema / conifg
    and loads it into a single, unified dataset **in the same** region as the CloudSQL
    instance. In the process, creates / updates views that provide direct federated
    connections to the CloudSQL instance and intermediate state-segmented datasets
    (where appropriate).

    Example resulting datasets (OPERATIONS schema):
      operations_cloudsql_connection  <-- Federated views
      us_xx_operations_regional  <-- Materialized data from most recent export for state
      us_yy_operations_regional
      operations_regional  <-- Materialized data from most recent export for each state
    """

    if config.is_state_segmented_refresh_schema():
        collector: BigQueryViewCollector[
            FederatedCloudSQLTableBigQueryViewBuilder
        ] = StateSegmentedSchemaFederatedBigQueryViewCollector(config)
    else:
        collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config)

    view_builders = collector.collect_view_builders()

    # TODO(#7285): Migrate Justice Counts connection to be in same region as instance
    if config.schema_type == SchemaType.JUSTICE_COUNTS:
        bq_region_override = None
    else:
        bq_region_override = SQLAlchemyEngineManager.get_cloudsql_instance_region(
            config.schema_type)

    dataset_overrides = None
    if dataset_override_prefix:
        dataset_overrides = dataset_overrides_for_view_builders(
            view_dataset_override_prefix=dataset_override_prefix,
            view_builders=view_builders,
        )
    create_managed_dataset_and_deploy_views_for_view_builders(
        view_source_table_datasets=set(),
        view_builders_to_update=view_builders,
        dataset_overrides=dataset_overrides,
        bq_region_override=bq_region_override,
        force_materialize=True,
    )

    if config.is_state_segmented_refresh_schema():
        _hydrate_unioned_regional_dataset_for_schema(config,
                                                     bq_region_override,
                                                     dataset_override_prefix)
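
# Hypothetical usage sketch for _federated_bq_regional_dataset_refresh above; the
# schema choice and sandbox prefix are illustrative. With a prefix of "my_prefix",
# the OPERATIONS refresh would land in prefixed datasets such as
# my_prefix_operations_regional, following the override pattern shown in the tests.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)
_federated_bq_regional_dataset_refresh(config, dataset_override_prefix="my_prefix")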
Example 4
def export_table(table_name: str, cloud_sql_to_bq_config: CloudSqlToBQConfig) -> bool:
    """Export a Cloud SQL table to a CSV file on GCS.

    Given a table name and export_query, retrieve the export URI from
    cloud_sql_to_bq_config, then execute the export operation and wait until it
    completes.

    Args:
        table_name: Table to export.
        cloud_sql_to_bq_config: The export config class for the table's SchemaType.
    Returns:
        True if operation succeeded without errors, False if not.
    """
    schema_type = cloud_sql_to_bq_config.schema_type
    export_query = cloud_sql_to_bq_config.get_table_export_query(table_name)
    export_uri = cloud_sql_to_bq_config.get_gcs_export_uri_for_table(table_name)

    export_context = create_export_context(schema_type, export_uri, export_query)

    project_id = metadata.project_id()
    instance_id = SQLAlchemyEngineManager.get_stripped_cloudsql_instance_id(schema_type)
    export_request = (
        sqladmin_client()
        .instances()
        .export(project=project_id, instance=instance_id, body=export_context)
    )

    logging.info("GCS URI [%s] in project [%s]", export_uri, project_id)
    logging.info("Starting export: [%s]", str(export_request.to_json()))

    try:
        response = export_request.execute()
    except googleapiclient.errors.HttpError:
        logging.exception("Failed to export table [%s]", table_name)
        return False

    # We need to block until the operation is done because
    # the Cloud SQL API only supports one operation at a time.
    operation_id = response["name"]
    logging.info(
        "Waiting for export operation [%s] to complete for table [%s] "
        "in database [%s] in project [%s]",
        operation_id,
        table_name,
        instance_id,
        project_id,
    )
    operation_success = wait_until_operation_finished(operation_id)

    return operation_success
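
# Minimal usage sketch for export_table(): export every table for one schema and
# collect the failures. CloudSqlToBQConfig.for_schema_type and get_tables_to_export
# come from the surrounding examples; the schema choice is illustrative.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
failed_tables = [
    table.name
    for table in config.get_tables_to_export()
    if not export_table(table.name, cloud_sql_to_bq_config=config)
]
if failed_tables:
    logging.error("Export failed for tables: %s", failed_tables)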
Example 5
def _copy_regional_dataset_to_multi_region(
        config: CloudSqlToBQConfig,
        dataset_override_prefix: Optional[str]) -> None:
    """Copies the unioned regional dataset for a schema to the multi-region dataset
    that contains the same data. Backs up the multi-region dataset before performing
    the copy. This backup dataset will get cleaned up if the copy succeeds, but
    otherwise will stick around for 1 week before tables expire.
    """
    bq_client = BigQueryClientImpl()

    source_dataset_id = config.unioned_regional_dataset(
        dataset_override_prefix)
    destination_dataset_id = config.unioned_multi_region_dataset(
        dataset_override_prefix)
    destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id)

    backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists(
        destination_dataset_id)

    try:
        if bq_client.dataset_exists(destination_dataset):
            tables = bq_client.list_tables(destination_dataset_id)
            for table in tables:
                bq_client.delete_table(table.dataset_id, table.table_id)

        bq_client.create_dataset_if_necessary(
            destination_dataset,
            default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS
            if dataset_override_prefix else None,
        )

        # Copy into the canonical unioned source datasets in the US multi-region
        bq_client.copy_dataset_tables_across_regions(
            source_dataset_id=source_dataset_id,
            destination_dataset_id=destination_dataset_id,
        )
    except Exception as e:
        logging.info(
            "Failed to flash [%s] to [%s] - contents backup can be found at [%s]",
            source_dataset_id,
            destination_dataset_id,
            backup_dataset.dataset_id if backup_dataset else "NO BACKUP",
        )
        raise e

    if backup_dataset:
        bq_client.delete_dataset(backup_dataset,
                                 delete_contents=True,
                                 not_found_ok=True)
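
# Hypothetical caller sketch for _copy_regional_dataset_to_multi_region above: when
# a sandbox prefix is passed, the destination dataset is created with the temp-dataset
# table expiration, so sandbox copies clean themselves up. The prefix is illustrative.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)
_copy_regional_dataset_to_multi_region(config, dataset_override_prefix="my_sandbox")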
Example 6
def refresh_bq_table() -> Tuple[str, int]:
    """Worker function to handle BQ export task requests.

    Form data must be a bytes-encoded JSON object with parameters listed below.

    URL Parameters:
        table_name: Table to export then import. Table must be defined
            in one of the base schema types.
    """
    json_data = request.get_data(as_text=True)
    data = json.loads(json_data)
    table_name = data['table_name']
    schema_type_str = data['schema_type']

    try:
        schema_type = SchemaType(schema_type_str)
    except ValueError:
        return (f'Unknown schema type [{schema_type_str}]',
                HTTPStatus.BAD_REQUEST)

    bq_client = BigQueryClientImpl()
    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type)

    logging.info("Starting BQ export task for table: %s", table_name)

    export_table_then_load_table(bq_client, table_name, cloud_sql_to_bq_config)
    return ('', HTTPStatus.OK)
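
# Illustrative task payload for the refresh_bq_table handler above. Only the JSON
# keys (table_name, schema_type) come from the handler; the table name and the
# enqueueing mechanism are assumptions.
payload = json.dumps({"table_name": "state_person", "schema_type": "STATE"}).encode()
# A Cloud Tasks task (or a test client) would POST this bytes-encoded JSON body to
# the route that dispatches to refresh_bq_table.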
    def test_incorrect_direct_ingest_instance_raises(self) -> None:
        for schema_type in self.enabled_schema_types:
            if schema_type != SchemaType.STATE:
                with self.assertRaises(ValueError):
                    _ = CloudSqlToBQConfig.for_schema_type(
                        schema_type, DirectIngestInstance.PRIMARY
                    )
    def setUp(self) -> None:
        self.schema_types: List[SchemaType] = list(SchemaType)
        self.enabled_schema_types = [
            schema_type
            for schema_type in self.schema_types
            if CloudSqlToBQConfig.is_valid_schema_type(schema_type)
        ]
        self.mock_project_id = "fake-recidiviz-project"
        self.metadata_patcher = mock.patch(
            "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.metadata"
        )
        self.mock_metadata = self.metadata_patcher.start()
        self.mock_metadata.project_id.return_value = self.mock_project_id

        self.gcs_factory_patcher = mock.patch(
            "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.GcsfsFactory.build"
        )
        self.fake_gcs = FakeGCSFileSystem()
        self.gcs_factory_patcher.start().return_value = self.fake_gcs
        self.set_config_yaml(
            """
region_codes_to_exclude:
  - US_ND
state_history_tables_to_include:
  - state_person_history
county_columns_to_exclude:
  person:
    - full_name
    - birthdate_inferred_from_age
"""
        )
Example 9
    def test_get_tables_to_export(self) -> None:
        """Assertions for the method get_tables_to_export
        1. Assert that it returns a list of type sqlalchemy.Table
        2. For the StateBase schema, assert that included history tables are included
        3. For the StateBase schema, assert that other history tables are excluded
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            tables_to_export = config.get_tables_to_export()

            self.assertIsInstance(tables_to_export, List)

            for table in tables_to_export:
                self.assertIsInstance(table, sqlalchemy.Table)

            if schema_type == SchemaType.STATE:
                history_tables_to_include = config.history_tables_to_include
                for history_table in history_tables_to_include:
                    table_names = list(map(lambda t: t.name, tables_to_export))
                    self.assertIn(history_table, table_names)

                for table in config.sorted_tables:
                    if ("history" in table.name and table.name
                            not in config.history_tables_to_include):
                        self.assertNotIn(table, tables_to_export)
    def test_build_unioned_segments_view_with_dataset_overrides(self) -> None:
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
        view_builder = UnionedStateSegmentsViewBuilder(
            config=config,
            table=StateBase.metadata.tables["state_person_external_id"],
            state_codes=[StateCode.US_XX, StateCode.US_WW],
        )
        view = view_builder.build(
            dataset_overrides={
                "state_regional": "my_prefix_state_regional",
                "us_xx_state_regional": "my_prefix_us_xx_state_regional",
                "us_ww_state_regional": "my_prefix_us_ww_state_regional",
            }
        )
        expected_query = (
            "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
            "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
            "state_person_external_id.person_id FROM "
            "`recidiviz-staging.my_prefix_us_xx_state_regional.state_person_external_id` state_person_external_id\n"
            "UNION ALL\n"
            "SELECT state_person_external_id.external_id,state_person_external_id.state_code,"
            "state_person_external_id.id_type,state_person_external_id.person_external_id_id,"
            "state_person_external_id.person_id FROM "
            "`recidiviz-staging.my_prefix_us_ww_state_regional.state_person_external_id` state_person_external_id"
        )
        self.assertEqual(expected_query, view.view_query)
        self.assertEqual(
            BigQueryAddress(
                dataset_id="my_prefix_state_regional",
                table_id="state_person_external_id",
            ),
            view.materialized_address,
        )
Example 11
    def test_yaml_config_reads_correctly_JAILS(
        self,
        schema: SchemaType,
        regions_to_exclude: List[str],
        columns_to_exclude: Dict[str, List[str]],
        history_tables_to_include: List[str],
    ) -> None:
        config = CloudSqlToBQConfig.for_schema_type(schema)
        assert config is not None

        self.assertListsDistinctAndEqual(
            regions_to_exclude,
            config.region_codes_to_exclude,
            msg_prefix="Region codes",
        )
        self.assertListsDistinctAndEqual(
            history_tables_to_include,
            config.history_tables_to_include,
            msg_prefix="History tables",
        )

        self.assertListsDistinctAndEqual(
            list(columns_to_exclude.keys()),
            list(config.columns_to_exclude.keys()),
            msg_prefix="Excluded columns keys",
        )
        for k in columns_to_exclude.keys():
            self.assertListsDistinctAndEqual(
                columns_to_exclude[k],
                config.columns_to_exclude[k],
                msg_prefix=f"Excluded columsn for {k}",
            )
def create_all_bq_refresh_tasks_for_schema(schema_arg: str) -> None:
    """Creates an export task for each table to be exported.

    A task is created for each table defined in the schema.

    Re-creates all tasks if any task fails to be created.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return

    logging.info("Beginning BQ export for %s schema tables.",
                 schema_type.value)

    task_manager = BQRefreshCloudTaskManager()

    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type)
    if cloud_sql_to_bq_config is None:
        logging.info("Cloud SQL to BQ is disabled for: %s", schema_type)
        return

    for table in cloud_sql_to_bq_config.get_tables_to_export():
        task_manager.create_refresh_bq_table_task(table.name, schema_type)

    if schema_type is SchemaType.STATE:
        pub_sub_topic = "v1.calculator.trigger_daily_pipelines"
        pub_sub_message = "State export to BQ complete"
    else:
        pub_sub_topic = ""
        pub_sub_message = ""

    task_manager.create_bq_refresh_monitor_task(schema_type.value,
                                                pub_sub_topic, pub_sub_message)
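
# Hypothetical usage sketch: enqueue refresh tasks for several schemas. Assumes the
# SchemaType enum values are the upper-cased strings used throughout these examples
# ("JAILS", "STATE", "OPERATIONS"); unknown values are silently skipped by the
# function above.
for schema_arg in ("jails", "state", "operations"):
    create_all_bq_refresh_tasks_for_schema(schema_arg)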
Example 13
    def test_for_schema_type_returns_instance(self) -> None:
        for schema_type in self.schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            if schema_type in self.disabled_schema_types:
                self.assertIsNone(config)
            else:
                self.assertIsInstance(config, CloudSqlToBQConfig)
    def update_data_freshness_results(self) -> None:
        """Refreshes information in the metadata store about freshness of ingested data
        for all states."""
        bq_export_config = CloudSqlToBQConfig.for_schema_type(
            SchemaType.STATE,
            yaml_path=GcsfsFilePath.from_absolute_path(
                f"gs://{self.project_id}-configs/cloud_sql_to_bq_config.yaml"
            ),
        )
        if bq_export_config is None:
            raise ValueError("STATE CloudSqlToBQConfig unexpectedly None.")

        regions_paused = bq_export_config.region_codes_to_exclude

        latest_upper_bounds_path = GcsfsFilePath.from_absolute_path(
            f"gs://{self.project_id}-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
        )
        latest_upper_bounds_json = self.gcs_fs.download_as_string(
            latest_upper_bounds_path
        )
        latest_upper_bounds = []

        for line in latest_upper_bounds_json.splitlines():
            line = line.strip()
            if not line:
                continue
            struct = json.loads(line)
            latest_upper_bounds.append(
                {
                    "state": struct["state_code"],
                    "date": struct.get("processed_date"),
                    "ingestPaused": struct["state_code"] in regions_paused,
                }
            )
        self.data_freshness_results = latest_upper_bounds
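
# Illustrative sketch (not from the source) of one line of the JSON-lines file read
# by update_data_freshness_results above and the entry it produces. The field names
# mirror the keys referenced in the method; the values are made up.
line = '{"state_code": "US_XX", "processed_date": "2021-04-01"}'
struct = json.loads(line)
entry = {
    "state": struct["state_code"],                      # "US_XX"
    "date": struct.get("processed_date"),               # "2021-04-01"
    "ingestPaused": struct["state_code"] in ["US_ND"],  # False; ["US_ND"] stands in for regions_paused
}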
Example 15
def load_table_from_gcs_and_wait(
    big_query_client: BigQueryClient,
    table_name: str,
    cloud_sql_to_bq_config: CloudSqlToBQConfig,
    destination_table_id: str,
) -> None:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name and a destination_table_id, retrieve the export URI and schema from cloud_sql_to_bq_config,
    then load the table into the destination_table_id.

    This starts the load job and waits until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        big_query_client: A BigQueryClient.
        table_name: Table to import. Table must be defined in the base schema.
        cloud_sql_to_bq_config: Export config class for a specific SchemaType.
        destination_table_id: Name of the destination table to load into.
    Returns:
        None if the table load succeeds.
    Raises:
        ValueError: If the table load fails.
    """
    uri = cloud_sql_to_bq_config.get_gcs_export_uri_for_table(table_name)

    logging.info("GCS URI [%s] in project [%s]", uri, metadata.project_id())

    bq_schema = cloud_sql_to_bq_config.get_bq_schema_for_table(table_name)
    dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client)

    load_job = big_query_client.load_table_from_cloud_storage_async(
        source_uri=uri,
        destination_dataset_ref=dataset_ref,
        destination_table_id=destination_table_id,
        destination_table_schema=bq_schema,
    )

    table_load_success = wait_for_table_load(big_query_client, load_job)

    if not table_load_success:
        raise ValueError(
            f"Copy from cloud storage to temp table failed. Skipping refresh for BQ table [{table_name}]"
        )
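
# Sketch of the export-then-load pattern using export_table and
# load_table_from_gcs_and_wait from these examples. The temp table naming and the
# schema/table choices are assumptions.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
bq_client = BigQueryClientImpl()
table_name = "person"
if export_table(table_name, cloud_sql_to_bq_config=config):
    load_table_from_gcs_and_wait(
        bq_client,
        table_name,
        cloud_sql_to_bq_config=config,
        destination_table_id=f"tmp_{table_name}",  # hypothetical temp table id
    )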
Example 16
def refresh_bq_schema(schema_arg: str) -> Tuple[str, HTTPStatus]:
    """Performs a full refresh of BigQuery data for a given schema, pulling data from
    the appropriate CloudSQL Postgres instance.

    On completion, triggers Dataflow pipelines (when necessary), releases the refresh
    lock and restarts any paused ingest work.
    """
    try:
        schema_type = SchemaType(schema_arg.upper())
    except ValueError:
        return (
            f"Unexpected value for schema_arg: [{schema_arg}]",
            HTTPStatus.BAD_REQUEST,
        )
    if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
        return (
            f"Unsupported schema type: [{schema_type}]",
            HTTPStatus.BAD_REQUEST,
        )

    lock_manager = CloudSqlToBQLockManager()

    try:
        can_proceed = lock_manager.can_proceed(schema_type)
    except GCSPseudoLockDoesNotExist as e:
        logging.exception(e)
        return (
            f"Expected lock for [{schema_arg}] BQ refresh to already exist.",
            HTTPStatus.EXPECTATION_FAILED,
        )

    if not can_proceed:
        return (
            f"Expected to be able to proceed with refresh before this endpoint was "
            f"called for [{schema_arg}].",
            HTTPStatus.EXPECTATION_FAILED,
        )

    federated_bq_schema_refresh(schema_type=schema_type)

    # Publish a message to the Pub/Sub topic once state BQ export is complete
    if schema_type is SchemaType.STATE:
        pubsub_helper.publish_message_to_topic(
            message="State export to BQ complete",
            topic="v1.calculator.trigger_daily_pipelines",
        )

    # Unlock export lock when all BQ exports complete
    lock_manager = CloudSqlToBQLockManager()
    lock_manager.release_lock(schema_type)
    logging.info(
        "Done running refresh for [%s], unlocking Postgres to BigQuery export",
        schema_type.value,
    )

    # Kick scheduler to restart ingest
    kick_all_schedulers()

    return "", HTTPStatus.OK
Example 17
    def test_collect_do_not_crash(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        for schema_type in SchemaType:
            if not CloudSqlToBQConfig.is_valid_schema_type(schema_type):
                continue
            config = CloudSqlToBQConfig.for_schema_type(schema_type)

            if config.is_state_segmented_refresh_schema():
                _ = StateSegmentedSchemaFederatedBigQueryViewCollector(
                    config).collect_view_builders()
            else:
                _ = UnsegmentedSchemaFederatedBigQueryViewCollector(
                    config).collect_view_builders()
    def test_unioned_segments_view_unsegmented_config_crashes(self) -> None:
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        with self.assertRaises(ValueError) as e:
            _ = UnionedStateSegmentsViewBuilder(
                config=config,
                table=JailsBase.metadata.sorted_tables[0],
                state_codes=[StateCode.US_XX],
            )
        self.assertEqual(str(e.exception), "Unexpected schema type [JAILS]")
Example 19
    def test_get_bq_schema_for_table_region_code_in_schema(self) -> None:
        """Assert that the region code is included in the schema for association tables in the State schema."""
        association_table_name = (
            "state_supervision_period_program_assignment_association")
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
        assert config is not None
        region_code_col = "state_code"
        schema = config.get_bq_schema_for_table(association_table_name)

        self.assertIn(region_code_col,
                      [schema_field.name for schema_field in schema])
Example 20
def delete_temp_table_if_exists(
        big_query_client: BigQueryClient, temp_table_name: str,
        cloud_sql_to_bq_config: CloudSqlToBQConfig) -> None:
    dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client)
    if not big_query_client.table_exists(dataset_ref=dataset_ref,
                                         table_id=temp_table_name):
        logging.info('Delete temp table failed, table [%s] does not exist.',
                     temp_table_name)
        return
    big_query_client.delete_table(dataset_id=cloud_sql_to_bq_config.dataset_id,
                                  table_id=temp_table_name)
    logging.info('Deleted temporary table [%s]', temp_table_name)
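
# Minimal usage sketch for delete_temp_table_if_exists; the temp table name shown
# here is an assumption about the naming convention.
config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
delete_temp_table_if_exists(BigQueryClientImpl(), "tmp_person", config)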
Example 21
    def test_get_gcs_export_uri_for_table(self) -> None:
        """Test that get_gcs_export_uri_for_table generates a GCS URI
        with the correct project ID and table name.
        """
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        assert config is not None
        fake_table = "my_fake_table"
        bucket = "{}-dbexport".format(self.mock_project_id)
        gcs_export_uri = "gs://{bucket}/{table_name}.csv".format(
            bucket=bucket, table_name=fake_table)
        self.assertEqual(gcs_export_uri,
                         config.get_gcs_export_uri_for_table(fake_table))
Example 22
    def test_get_stale_bq_rows_for_excluded_regions_query_builder_jails_schema(
        self,
    ) -> None:
        """Given a JAILS schema, a table name and None for region_codes_to_exclude,
        assert that it returns a query builder that returns no rows."""
        filter_clause = "WHERE FALSE"
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        assert config is not None
        for table in config.get_tables_to_export():
            query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder(
                table.name)
            self.assertIsInstance(
                query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder)
            self.assertEqual(filter_clause, query_builder.filter_clause())
    def test_unioned_multi_region_dataset(self) -> None:
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            dataset = config.unioned_multi_region_dataset(dataset_override_prefix=None)
            self.assertFalse(dataset.endswith("regional"))
            self.assertTrue(dataset in VIEW_SOURCE_TABLE_DATASETS)

            dataset_with_prefix = config.unioned_multi_region_dataset(
                dataset_override_prefix="prefix"
            )
            self.assertTrue(dataset_with_prefix.startswith("prefix_"))
            self.assertFalse(dataset_with_prefix.endswith("regional"))
            self.assertTrue(dataset_with_prefix not in VIEW_SOURCE_TABLE_DATASETS)
    def test_excluded_columns(self) -> None:
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            for table in config.sorted_tables:
                # pylint: disable=protected-access
                columns = config._get_table_columns_to_export(table)
                for column in columns:
                    self.assertIsInstance(column, str)
                    self.assertTrue(
                        column not in config.columns_to_exclude.get(table.name, []),
                        msg=f"Column {column} should not be included. It is found in "
                        f"COUNTY_COLUMNS_TO_EXCLUDE for this table {table.name}.",
                    )
Example 25
    def __init__(
        self,
        *,
        config: CloudSqlToBQConfig,
        table: Table,
        state_codes: List[StateCode],
    ):
        if not config.is_state_segmented_refresh_schema():
            raise ValueError(
                f"Unexpected schema type [{config.schema_type.name}]")

        self.config = config
        self.table = table
        self.state_codes = state_codes
        # Dataset prefixing will be handled automatically by the view building logic
        self.dataset_id = config.unioned_regional_dataset(
            dataset_override_prefix=None)
        self.view_id = f"{table.name}_view"
        self.materialized_address_override = BigQueryAddress(
            dataset_id=self.dataset_id,
            table_id=table.name,
        )
Example 26
    def test_unsegmented_collector_jails(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS)
        collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config)
        builders = collector.collect_view_builders()
        self.assertEqual(
            len(JailsBase.metadata.sorted_tables),
            len(builders),
        )
        view_addresses = set()
        materialized_addresses = set()
        for builder in builders:
            view = builder.build()
            view_addresses.add(view.address)
            if not view.materialized_address:
                raise ValueError(
                    f"Materialized address None for view [{view.address}]")
            materialized_addresses.add(view.materialized_address)

            if view.view_id == "person":
                # Check that we explicitly select columns
                self.assertTrue("person.birthdate" in view.view_query)
                # ... but not excluded ones
                self.assertTrue("full_name" not in view.view_query)
                self.assertTrue(
                    "birthdate_inferred_from_age" not in view.view_query)

        self.assertEqual({"jails_cloudsql_connection"},
                         {a.dataset_id
                          for a in view_addresses})
        self.assertEqual({"census_regional"},
                         {a.dataset_id
                          for a in materialized_addresses})
        self.assertEqual(
            {t.name
             for t in JailsBase.metadata.sorted_tables},
            {a.table_id
             for a in materialized_addresses},
        )

        # No addresses should clobber each other
        self.assertEqual(len(view_addresses), len(builders))
        self.assertEqual(len(materialized_addresses), len(builders))
        self.assertEqual(set(),
                         view_addresses.intersection(materialized_addresses))
Example 27
    def test_dataset_id(self) -> None:
        """Make sure dataset_id is defined correctly.

        Checks that it is a string, checks that it has characters,
        and checks that those characters are letters, numbers, or _.
        """
        for schema_type in self.enabled_schema_types:
            config = CloudSqlToBQConfig.for_schema_type(schema_type)
            assert config is not None
            allowed_characters = set(string.ascii_letters + string.digits +
                                     "_")

            self.assertIsInstance(config.dataset_id, str)

            for char in config.dataset_id:
                self.assertIn(char, allowed_characters)
Example 28
def create_all_jails_bq_refresh_tasks() -> Tuple[str, int]:
    """Creates an export task for each table to be exported.

    A task is created for each table defined in the JailsBase schema.

    Re-creates all tasks if any task fails to be created.
    """
    logging.info("Beginning BQ export for jails schema tables.")

    task_manager = BQRefreshCloudTaskManager()

    cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(
        SchemaType.JAILS)
    for table in cloud_sql_to_bq_config.get_tables_to_export():
        task_manager.create_refresh_bq_table_task(table.name, SchemaType.JAILS)
    return ('', HTTPStatus.OK)
Example 29
    def test_get_stale_bq_rows_for_excluded_regions_query_builder(
            self) -> None:
        """Given a table name, assert that it returns a query builder that filters for rows
        excluded from the export query"""
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE)
        assert config is not None
        config.region_codes_to_exclude = ["US_VA", "us_id", "US_hi"]
        for table in config.get_tables_to_export():
            if is_association_table(table.name):
                # This is tested in the CloudSqlSchemaTableRegionFilteredQueryBuilder class
                continue
            filter_clause = "WHERE state_code IN ('US_VA','US_ID','US_HI')"
            query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder(
                table.name)

            self.assertIsInstance(
                query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder)
            self.assertIn(filter_clause, query_builder.full_query())
Example 30
    def test_state_segmented_collector_paused_regions(self) -> None:
        self.fake_fs.upload_from_string(
            path=self.fake_config_path,
            contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML,
            content_type="text/yaml",
        )
        config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS)
        collector = StateSegmentedSchemaFederatedBigQueryViewCollector(config)
        builders = collector.collect_view_builders()
        direct_ingest_states = get_existing_direct_ingest_states()
        num_schema_tables = len(OperationsBase.metadata.sorted_tables)
        num_paused_regions = 1
        self.assertEqual(
            len(direct_ingest_states) * num_schema_tables -
            num_paused_regions * num_schema_tables,
            len(builders),
        )
        view_addresses = set()
        materialized_addresses = set()
        for builder in builders:
            view = builder.build()
            view_addresses.add(view.address)
            if not view.materialized_address:
                raise ValueError(
                    f"Materialized address None for view [{view.address}]")
            materialized_addresses.add(view.materialized_address)

        self.assertEqual({"operations_cloudsql_connection"},
                         {a.dataset_id
                          for a in view_addresses})
        self.assertNotIn("us_nd_operations_regional",
                         {a.dataset_id
                          for a in materialized_addresses})
        self.assertEqual(
            {t.name
             for t in OperationsBase.metadata.sorted_tables},
            {a.table_id
             for a in materialized_addresses},
        )
        # No addresses should clobber each other
        self.assertEqual(len(view_addresses), len(builders))
        self.assertEqual(len(materialized_addresses), len(builders))
        self.assertEqual(set(),
                         view_addresses.intersection(materialized_addresses))