def main( sandbox_dataset_prefix: str, schema_type: SchemaType, direct_ingest_instance: Optional[DirectIngestInstance], ) -> None: """Defines the main function responsible for moving data from Postgres to BQ.""" logging.info("Prefixing all output datasets with [%s_].", known_args.sandbox_dataset_prefix) fake_gcs = FakeGCSFileSystem() # We mock the export config to a version that does not have any paused regions. with mock.patch( f"{cloud_sql_to_bq_refresh_config.__name__}.GcsfsFactory.build", return_value=fake_gcs, ): fake_gcs.upload_from_string( path=CloudSqlToBQConfig.default_config_path(), contents=STANDARD_YAML_CONTENTS, content_type="text/yaml", ) federated_bq_schema_refresh( schema_type=schema_type, direct_ingest_instance=direct_ingest_instance, dataset_override_prefix=sandbox_dataset_prefix, ) config = CloudSqlToBQConfig.for_schema_type(schema_type) final_destination_dataset = config.unioned_multi_region_dataset( dataset_override_prefix=sandbox_dataset_prefix) logging.info("Load complete. Data loaded to dataset [%s].", final_destination_dataset)
def test_for_schema_type_returns_instance(self) -> None: for schema_type in self.schema_types: if not CloudSqlToBQConfig.is_valid_schema_type(schema_type): with self.assertRaises(ValueError): _ = CloudSqlToBQConfig.for_schema_type(schema_type) else: config = CloudSqlToBQConfig.for_schema_type(schema_type) self.assertIsInstance(config, CloudSqlToBQConfig)
def _federated_bq_regional_dataset_refresh( config: CloudSqlToBQConfig, dataset_override_prefix: Optional[str] = None, ) -> None: """Queries data in the appropriate CloudSQL instance for the given schema / conifg and loads it into a single, unified dataset **in the same** region as the CloudSQL instance. In the process, creates / updates views that provide direct federated connections to the CloudSQL instance and intermediate state-segmented datasets (where appropriate). Example resulting datasets (OPERATIONS schema): operations_cloudsql_connection <-- Federated views us_xx_operations_regional <-- Materialized data from most recent export for state us_yy_operations_regional operations_regional <-- Materialized data from most recent export for each state """ if config.is_state_segmented_refresh_schema(): collector: BigQueryViewCollector[ FederatedCloudSQLTableBigQueryViewBuilder] = StateSegmentedSchemaFederatedBigQueryViewCollector( config) else: collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config) view_builders = collector.collect_view_builders() # TODO(#7285): Migrate Justice Counts connection to be in same region as instance if config.schema_type == SchemaType.JUSTICE_COUNTS: bq_region_override = None else: bq_region_override = SQLAlchemyEngineManager.get_cloudsql_instance_region( config.schema_type) dataset_overrides = None if dataset_override_prefix: dataset_overrides = dataset_overrides_for_view_builders( view_dataset_override_prefix=dataset_override_prefix, view_builders=view_builders, ) create_managed_dataset_and_deploy_views_for_view_builders( view_source_table_datasets=set(), view_builders_to_update=view_builders, dataset_overrides=dataset_overrides, bq_region_override=bq_region_override, force_materialize=True, ) if config.is_state_segmented_refresh_schema(): _hydrate_unioned_regional_dataset_for_schema(config, bq_region_override, dataset_override_prefix)
def export_table(table_name: str, cloud_sql_to_bq_config: CloudSqlToBQConfig) -> bool: """Export a Cloud SQL table to a CSV file on GCS. Given a table name and export_query, retrieve the export URI from cloud_sql_to_bq_config, then execute the export operation and wait until it completes. Args: table_name: Table to export. cloud_sql_to_bq_config: The export config class for the table's SchemaType. Returns: True if operation succeeded without errors, False if not. """ schema_type = cloud_sql_to_bq_config.schema_type export_query = cloud_sql_to_bq_config.get_table_export_query(table_name) export_uri = cloud_sql_to_bq_config.get_gcs_export_uri_for_table(table_name) export_context = create_export_context(schema_type, export_uri, export_query) project_id = metadata.project_id() instance_id = SQLAlchemyEngineManager.get_stripped_cloudsql_instance_id(schema_type) export_request = ( sqladmin_client() .instances() .export(project=project_id, instance=instance_id, body=export_context) ) logging.info("GCS URI [%s] in project [%s]", export_uri, project_id) logging.info("Starting export: [%s]", str(export_request.to_json())) try: response = export_request.execute() except googleapiclient.errors.HttpError: logging.exception("Failed to export table [%s]", table_name) return False # We need to block until the operation is done because # the Cloud SQL API only supports one operation at a time. operation_id = response["name"] logging.info( "Waiting for export operation [%s] to complete for table [%s] " "in database [%s] in project [%s]", operation_id, table_name, instance_id, project_id, ) operation_success = wait_until_operation_finished(operation_id) return operation_success
def _copy_regional_dataset_to_multi_region( config: CloudSqlToBQConfig, dataset_override_prefix: Optional[str]) -> None: """Copies the unioned regional dataset for a schema to the multi-region dataset that contains the same data. Backs up the multi-region dataset before performing the copy. This backup dataset will get cleaned up if the copy succeeds, but otherwise will stick around for 1 week before tables expire. """ bq_client = BigQueryClientImpl() source_dataset_id = config.unioned_regional_dataset( dataset_override_prefix) destination_dataset_id = config.unioned_multi_region_dataset( dataset_override_prefix) destination_dataset = bq_client.dataset_ref_for_id(destination_dataset_id) backup_dataset = bq_client.backup_dataset_tables_if_dataset_exists( destination_dataset_id) try: if bq_client.dataset_exists(destination_dataset): tables = bq_client.list_tables(destination_dataset_id) for table in tables: bq_client.delete_table(table.dataset_id, table.table_id) bq_client.create_dataset_if_necessary( destination_dataset, default_table_expiration_ms=TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS if dataset_override_prefix else None, ) # Copy into the canonical unioned source datasets in the US multi-region bq_client.copy_dataset_tables_across_regions( source_dataset_id=source_dataset_id, destination_dataset_id=destination_dataset_id, ) except Exception as e: logging.info( "Failed to flash [%s] to [%s] - contents backup can be found at [%s]", source_dataset_id, destination_dataset_id, backup_dataset.dataset_id if backup_dataset else "NO BACKUP", ) raise e if backup_dataset: bq_client.delete_dataset(backup_dataset, delete_contents=True, not_found_ok=True)
def refresh_bq_table() -> Tuple[str, int]: """Worker function to handle BQ export task requests. Form data must be a bytes-encoded JSON object with parameters listed below. URL Parameters: table_name: Table to export then import. Table must be defined in one of the base schema types. """ json_data = request.get_data(as_text=True) data = json.loads(json_data) table_name = data['table_name'] schema_type_str = data['schema_type'] try: schema_type = SchemaType(schema_type_str) except ValueError: return (f'Unknown schema type [{schema_type_str}]', HTTPStatus.BAD_REQUEST) bq_client = BigQueryClientImpl() cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type) logging.info("Starting BQ export task for table: %s", table_name) export_table_then_load_table(bq_client, table_name, cloud_sql_to_bq_config) return ('', HTTPStatus.OK)
def test_incorrect_direct_ingest_instance_raises(self) -> None: for schema_type in self.enabled_schema_types: if schema_type != SchemaType.STATE: with self.assertRaises(ValueError): _ = CloudSqlToBQConfig.for_schema_type( schema_type, DirectIngestInstance.PRIMARY )
def setUp(self) -> None: self.schema_types: List[SchemaType] = list(SchemaType) self.enabled_schema_types = [ schema_type for schema_type in self.schema_types if CloudSqlToBQConfig.is_valid_schema_type(schema_type) ] self.mock_project_id = "fake-recidiviz-project" self.metadata_patcher = mock.patch( "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.metadata" ) self.mock_metadata = self.metadata_patcher.start() self.mock_metadata.project_id.return_value = self.mock_project_id self.gcs_factory_patcher = mock.patch( "recidiviz.persistence.database.bq_refresh.cloud_sql_to_bq_refresh_config.GcsfsFactory.build" ) self.fake_gcs = FakeGCSFileSystem() self.gcs_factory_patcher.start().return_value = self.fake_gcs self.set_config_yaml( """ region_codes_to_exclude: - US_ND state_history_tables_to_include: - state_person_history county_columns_to_exclude: person: - full_name - birthdate_inferred_from_age """ )
def test_get_tables_to_export(self) -> None: """Assertions for the method get_tables_to_export 1. Assert that it returns a list of type sqlalchemy.Table 2. For the StateBase schema, assert that included history tables are included 3. For the StateBase schema, assert that other history tables are excluded """ for schema_type in self.enabled_schema_types: config = CloudSqlToBQConfig.for_schema_type(schema_type) assert config is not None tables_to_export = config.get_tables_to_export() self.assertIsInstance(tables_to_export, List) for table in tables_to_export: self.assertIsInstance(table, sqlalchemy.Table) if schema_type == SchemaType.STATE: history_tables_to_include = config.history_tables_to_include for history_table in history_tables_to_include: table_names = list(map(lambda t: t.name, tables_to_export)) self.assertIn(history_table, table_names) for table in config.sorted_tables: if ("history" in table.name and table.name not in config.history_tables_to_include): self.assertNotIn(table, tables_to_export)
def test_build_unioned_segments_view_with_dataset_overrides(self) -> None: config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE) view_builder = UnionedStateSegmentsViewBuilder( config=config, table=StateBase.metadata.tables["state_person_external_id"], state_codes=[StateCode.US_XX, StateCode.US_WW], ) view = view_builder.build( dataset_overrides={ "state_regional": "my_prefix_state_regional", "us_xx_state_regional": "my_prefix_us_xx_state_regional", "us_ww_state_regional": "my_prefix_us_ww_state_regional", } ) expected_query = ( "SELECT state_person_external_id.external_id,state_person_external_id.state_code," "state_person_external_id.id_type,state_person_external_id.person_external_id_id," "state_person_external_id.person_id FROM " "`recidiviz-staging.my_prefix_us_xx_state_regional.state_person_external_id` state_person_external_id\n" "UNION ALL\n" "SELECT state_person_external_id.external_id,state_person_external_id.state_code," "state_person_external_id.id_type,state_person_external_id.person_external_id_id," "state_person_external_id.person_id FROM " "`recidiviz-staging.my_prefix_us_ww_state_regional.state_person_external_id` state_person_external_id" ) self.assertEqual(expected_query, view.view_query) self.assertEqual( BigQueryAddress( dataset_id="my_prefix_state_regional", table_id="state_person_external_id", ), view.materialized_address, )
def test_yaml_config_reads_correctly_JAILS( self, schema: SchemaType, regions_to_exclude: List[str], columns_to_exclude: Dict[str, List[str]], history_tables_to_include: List[str], ) -> None: config = CloudSqlToBQConfig.for_schema_type(schema) assert config is not None self.assertListsDistinctAndEqual( regions_to_exclude, config.region_codes_to_exclude, msg_prefix="Region codes", ) self.assertListsDistinctAndEqual( history_tables_to_include, config.history_tables_to_include, msg_prefix="History tables", ) self.assertListsDistinctAndEqual( list(columns_to_exclude.keys()), list(config.columns_to_exclude.keys()), msg_prefix="Excluded columns keys", ) for k in columns_to_exclude.keys(): self.assertListsDistinctAndEqual( columns_to_exclude[k], config.columns_to_exclude[k], msg_prefix=f"Excluded columsn for {k}", )
def create_all_bq_refresh_tasks_for_schema(schema_arg: str) -> None: """Creates an export task for each table to be exported. A task is created for each table defined in the schema. Re-creates all tasks if any task fails to be created. """ try: schema_type = SchemaType(schema_arg.upper()) except ValueError: return logging.info("Beginning BQ export for %s schema tables.", schema_type.value) task_manager = BQRefreshCloudTaskManager() cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type(schema_type) if cloud_sql_to_bq_config is None: logging.info("Cloud SQL to BQ is disabled for: %s", schema_type) return for table in cloud_sql_to_bq_config.get_tables_to_export(): task_manager.create_refresh_bq_table_task(table.name, schema_type) if schema_type is SchemaType.STATE: pub_sub_topic = "v1.calculator.trigger_daily_pipelines" pub_sub_message = "State export to BQ complete" else: pub_sub_topic = "" pub_sub_message = "" task_manager.create_bq_refresh_monitor_task(schema_type.value, pub_sub_topic, pub_sub_message)
def test_for_schema_type_returns_instance(self) -> None: for schema_type in self.schema_types: config = CloudSqlToBQConfig.for_schema_type(schema_type) if schema_type in self.disabled_schema_types: self.assertIsNone(config) else: self.assertIsInstance(config, CloudSqlToBQConfig)
def update_data_freshness_results(self) -> None: """Refreshes information in the metadata store about freshness of ingested data for all states.""" bq_export_config = CloudSqlToBQConfig.for_schema_type( SchemaType.STATE, yaml_path=GcsfsFilePath.from_absolute_path( f"gs://{self.project_id}-configs/cloud_sql_to_bq_config.yaml" ), ) if bq_export_config is None: raise ValueError("STATE CloudSqlToBQConfig unexpectedly None.") regions_paused = bq_export_config.region_codes_to_exclude latest_upper_bounds_path = GcsfsFilePath.from_absolute_path( f"gs://{self.project_id}-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json" ) latest_upper_bounds_json = self.gcs_fs.download_as_string( latest_upper_bounds_path ) latest_upper_bounds = [] for line in latest_upper_bounds_json.splitlines(): line = line.strip() if not line: continue struct = json.loads(line) latest_upper_bounds.append( { "state": struct["state_code"], "date": struct.get("processed_date"), "ingestPaused": struct["state_code"] in regions_paused, } ) self.data_freshness_results = latest_upper_bounds
def load_table_from_gcs_and_wait( big_query_client: BigQueryClient, table_name: str, cloud_sql_to_bq_config: CloudSqlToBQConfig, destination_table_id: str, ) -> None: """Loads a table from CSV data in GCS to BigQuery. Given a table name and a destination_table_id, retrieve the export URI and schema from cloud_sql_to_bq_config, then load the table into the destination_table_id. This starts the job, but does not wait until it completes. Tables are created if they do not exist, and overwritten if they do exist. Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's data will be completely wiped and overwritten with the contents of the CSV. Args: big_query_client: A BigQueryClient. table_name: Table to import. Table must be defined in the base schema. cloud_sql_to_bq_config: Export config class for a specific SchemaType. destination_table_id: Optional destination table name. If none is given, the provided table name is used. Returns: If the table load succeeds, returns None. If it fails it raises a ValueError. """ uri = cloud_sql_to_bq_config.get_gcs_export_uri_for_table(table_name) logging.info("GCS URI [%s] in project [%s]", uri, metadata.project_id()) bq_schema = cloud_sql_to_bq_config.get_bq_schema_for_table(table_name) dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client) load_job = big_query_client.load_table_from_cloud_storage_async( source_uri=uri, destination_dataset_ref=dataset_ref, destination_table_id=destination_table_id, destination_table_schema=bq_schema, ) table_load_success = wait_for_table_load(big_query_client, load_job) if not table_load_success: raise ValueError( f"Copy from cloud storage to temp table failed. Skipping refresh for BQ table [{table_name}]" )
def refresh_bq_schema(schema_arg: str) -> Tuple[str, HTTPStatus]: """Performs a full refresh of BigQuery data for a given schema, pulling data from the appropriate CloudSQL Postgres instance. On completion, triggers Dataflow pipelines (when necessary), releases the refresh lock and restarts any paused ingest work. """ try: schema_type = SchemaType(schema_arg.upper()) except ValueError: return ( f"Unexpected value for schema_arg: [{schema_arg}]", HTTPStatus.BAD_REQUEST, ) if not CloudSqlToBQConfig.is_valid_schema_type(schema_type): return ( f"Unsupported schema type: [{schema_type}]", HTTPStatus.BAD_REQUEST, ) lock_manager = CloudSqlToBQLockManager() try: can_proceed = lock_manager.can_proceed(schema_type) except GCSPseudoLockDoesNotExist as e: logging.exception(e) return ( f"Expected lock for [{schema_arg}] BQ refresh to already exist.", HTTPStatus.EXPECTATION_FAILED, ) if not can_proceed: return ( f"Expected to be able to proceed with refresh before this endpoint was " f"called for [{schema_arg}].", HTTPStatus.EXPECTATION_FAILED, ) federated_bq_schema_refresh(schema_type=schema_type) # Publish a message to the Pub/Sub topic once state BQ export is complete if schema_type is SchemaType.STATE: pubsub_helper.publish_message_to_topic( message="State export to BQ complete", topic="v1.calculator.trigger_daily_pipelines", ) # Unlock export lock when all BQ exports complete lock_manager = CloudSqlToBQLockManager() lock_manager.release_lock(schema_type) logging.info( "Done running refresh for [%s], unlocking Postgres to BigQuery export", schema_type.value, ) # Kick scheduler to restart ingest kick_all_schedulers() return "", HTTPStatus.OK
def test_collect_do_not_crash(self) -> None: self.fake_fs.upload_from_string( path=self.fake_config_path, contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML, content_type="text/yaml", ) for schema_type in SchemaType: if not CloudSqlToBQConfig.is_valid_schema_type(schema_type): continue config = CloudSqlToBQConfig.for_schema_type(schema_type) if config.is_state_segmented_refresh_schema(): _ = StateSegmentedSchemaFederatedBigQueryViewCollector( config).collect_view_builders() else: _ = UnsegmentedSchemaFederatedBigQueryViewCollector( config).collect_view_builders()
def test_unioned_segments_view_unsegmented_config_crashes(self) -> None: config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS) with self.assertRaises(ValueError) as e: _ = UnionedStateSegmentsViewBuilder( config=config, table=JailsBase.metadata.sorted_tables[0], state_codes=[StateCode.US_XX], ) self.assertEqual(str(e.exception), "Unexpected schema type [JAILS]")
def test_get_bq_schema_for_table_region_code_in_schema(self) -> None: """Assert that the region code is included in the schema for association tables in the State schema.""" association_table_name = ( "state_supervision_period_program_assignment_association") config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE) assert config is not None region_code_col = "state_code" schema = config.get_bq_schema_for_table(association_table_name) self.assertIn(region_code_col, [schema_field.name for schema_field in schema])
def delete_temp_table_if_exists( big_query_client: BigQueryClient, temp_table_name: str, cloud_sql_to_bq_config: CloudSqlToBQConfig) -> None: dataset_ref = cloud_sql_to_bq_config.get_dataset_ref(big_query_client) if not big_query_client.table_exists(dataset_ref=dataset_ref, table_id=temp_table_name): logging.info('Delete temp table failed, table [%s] does not exist.', temp_table_name) return big_query_client.delete_table(dataset_id=cloud_sql_to_bq_config.dataset_id, table_id=temp_table_name) logging.info('Deleted temporary table [%s]', temp_table_name)
def test_get_gcs_export_uri_for_table(self) -> None: """Test that get_gcs_export_uri_for_table generates a GCS URI with the correct project ID and table name. """ config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS) assert config is not None fake_table = "my_fake_table" bucket = "{}-dbexport".format(self.mock_project_id) gcs_export_uri = "gs://{bucket}/{table_name}.csv".format( bucket=bucket, table_name=fake_table) self.assertEqual(gcs_export_uri, config.get_gcs_export_uri_for_table(fake_table))
def test_get_stale_bq_rows_for_excluded_regions_query_builder_jails_schema( self, ) -> None: """Given a JAILS schema, a table name and None for region_codes_to_exclude, assert that it returns a query builder that returns no rows""" filter_clause = "WHERE FALSE" config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS) assert config is not None for table in config.get_tables_to_export(): query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder( table.name) self.assertIsInstance( query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder) self.assertEqual(filter_clause, query_builder.filter_clause())
def test_unioned_multi_region_dataset(self) -> None: for schema_type in self.enabled_schema_types: config = CloudSqlToBQConfig.for_schema_type(schema_type) dataset = config.unioned_multi_region_dataset(dataset_override_prefix=None) self.assertFalse(dataset.endswith("regional")) self.assertTrue(dataset in VIEW_SOURCE_TABLE_DATASETS) dataset_with_prefix = config.unioned_multi_region_dataset( dataset_override_prefix="prefix" ) self.assertTrue(dataset_with_prefix.startswith("prefix_")) self.assertFalse(dataset_with_prefix.endswith("regional")) self.assertTrue(dataset_with_prefix not in VIEW_SOURCE_TABLE_DATASETS)
def test_excluded_columns(self) -> None: for schema_type in self.enabled_schema_types: config = CloudSqlToBQConfig.for_schema_type(schema_type) for table in config.sorted_tables: # pylint: disable=protected-access columns = config._get_table_columns_to_export(table) for column in columns: self.assertIsInstance(column, str) self.assertTrue( column not in config.columns_to_exclude.get(table.name, []), msg=f"Column {column} should not be included. It is found in " f"COUNTY_COLUMNS_TO_EXCLUDE` for this table {table.name}.", )
def __init__( self, *, config: CloudSqlToBQConfig, table: Table, state_codes: List[StateCode], ): if not config.is_state_segmented_refresh_schema(): raise ValueError( f"Unexpected schema type [{config.schema_type.name}]") self.config = config self.table = table self.state_codes = state_codes # Dataset prefixing will ge handled automatically by view building logic self.dataset_id = config.unioned_regional_dataset( dataset_override_prefix=None) self.view_id = f"{table.name}_view" self.materialized_address_override = BigQueryAddress( dataset_id=self.dataset_id, table_id=table.name, )
def test_unsegmented_collector_jails(self) -> None: self.fake_fs.upload_from_string( path=self.fake_config_path, contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML, content_type="text/yaml", ) config = CloudSqlToBQConfig.for_schema_type(SchemaType.JAILS) collector = UnsegmentedSchemaFederatedBigQueryViewCollector(config) builders = collector.collect_view_builders() self.assertEqual( len(JailsBase.metadata.sorted_tables), len(builders), ) view_addresses = set() materialized_addresses = set() for builder in builders: view = builder.build() view_addresses.add(view.address) if not view.materialized_address: raise ValueError( f"Materialized address None for view [{view.address}]") materialized_addresses.add(view.materialized_address) if view.view_id == "person": # Check that we explicitly select columns self.assertTrue("person.birthdate" in view.view_query) # ... but not excluded ones self.assertTrue("full_name" not in view.view_query) self.assertTrue( "birthdate_inferred_from_age" not in view.view_query) self.assertEqual({"jails_cloudsql_connection"}, {a.dataset_id for a in view_addresses}) self.assertEqual({"census_regional"}, {a.dataset_id for a in materialized_addresses}) self.assertEqual( {t.name for t in JailsBase.metadata.sorted_tables}, {a.table_id for a in materialized_addresses}, ) # No addresses should clobber each other self.assertEqual(len(view_addresses), len(builders)) self.assertEqual(len(materialized_addresses), len(builders)) self.assertEqual(set(), view_addresses.intersection(materialized_addresses))
def test_dataset_id(self) -> None: """Make sure dataset_id is defined correctly. Checks that it is a string, checks that it has characters, and checks that those characters are letters, numbers, or _. """ for schema_type in self.enabled_schema_types: config = CloudSqlToBQConfig.for_schema_type(schema_type) assert config is not None allowed_characters = set(string.ascii_letters + string.digits + "_") self.assertIsInstance(config.dataset_id, str) for char in config.dataset_id: self.assertIn(char, allowed_characters)
def create_all_jails_bq_refresh_tasks() -> Tuple[str, int]: """Creates an export task for each table to be exported. A task is created for each table defined in the JailsBase schema. Re-creates all tasks if any task fails to be created. """ logging.info("Beginning BQ export for jails schema tables.") task_manager = BQRefreshCloudTaskManager() cloud_sql_to_bq_config = CloudSqlToBQConfig.for_schema_type( SchemaType.JAILS) for table in cloud_sql_to_bq_config.get_tables_to_export(): task_manager.create_refresh_bq_table_task(table.name, SchemaType.JAILS) return ('', HTTPStatus.OK)
def test_get_stale_bq_rows_for_excluded_regions_query_builder( self) -> None: """Given a table name, assert that it returns a query builder that filters for rows excluded from the export query""" config = CloudSqlToBQConfig.for_schema_type(SchemaType.STATE) assert config is not None config.region_codes_to_exclude = ["US_VA", "us_id", "US_hi"] for table in config.get_tables_to_export(): if is_association_table(table.name): # This is tested in the CloudSqlSchemaTableRegionFilteredQueryBuilder class continue filter_clause = "WHERE state_code IN ('US_VA','US_ID','US_HI')" query_builder = config.get_stale_bq_rows_for_excluded_regions_query_builder( table.name) self.assertIsInstance( query_builder, BigQuerySchemaTableRegionFilteredQueryBuilder) self.assertIn(filter_clause, query_builder.full_query())
def test_state_segmented_collector_paused_regions(self) -> None: self.fake_fs.upload_from_string( path=self.fake_config_path, contents=PAUSED_REGION_CLOUD_SQL_CONFIG_YAML, content_type="text/yaml", ) config = CloudSqlToBQConfig.for_schema_type(SchemaType.OPERATIONS) collector = StateSegmentedSchemaFederatedBigQueryViewCollector(config) builders = collector.collect_view_builders() direct_ingest_states = get_existing_direct_ingest_states() num_schema_tables = len(OperationsBase.metadata.sorted_tables) num_paused_regions = 1 self.assertEqual( len(direct_ingest_states) * num_schema_tables - num_paused_regions * num_schema_tables, len(builders), ) view_addresses = set() materialized_addresses = set() for builder in builders: view = builder.build() view_addresses.add(view.address) if not view.materialized_address: raise ValueError( f"Materialized address None for view [{view.address}]") materialized_addresses.add(view.materialized_address) self.assertEqual({"operations_cloudsql_connection"}, {a.dataset_id for a in view_addresses}) self.assertNotIn("us_nd_operations_regional", {a.dataset_id for a in materialized_addresses}) self.assertEqual( {t.name for t in OperationsBase.metadata.sorted_tables}, {a.table_id for a in materialized_addresses}, ) # No addresses should clobber each other self.assertEqual(len(view_addresses), len(builders)) self.assertEqual(len(materialized_addresses), len(builders)) self.assertEqual(set(), view_addresses.intersection(materialized_addresses))